├── CONTRIBUTING.md ├── DUC ├── eval.sh ├── make_DUC.py ├── make_rouge.py └── setup.sh ├── LICENSE ├── PATENTS ├── README.md ├── construct_data.sh ├── dataset ├── filter.py ├── make_dict.py ├── process_agiga.py ├── pull.py ├── small_train.splits ├── test.splits ├── train.splits └── valid.splits ├── prep_torch_data.sh ├── summary ├── beam_search.lua ├── build.lua ├── build_dict.lua ├── data.lua ├── encoder.lua ├── features.lua ├── nnlm.lua ├── run.lua ├── train.lua └── util.lua ├── test_model.sh ├── train_model.sh └── tuning ├── SDecoder_cfg.txt ├── SDecoder_cmd.tpl ├── SDecoder_test.py ├── ZMERT_cfg.txt └── params.txt /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Neural Attention Model for Abstractive Summarization software 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | ... (in particular how this is synced with internal changes to the project) 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `master`. 12 | 2. If you've added code that should be tested, add tests 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 2 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * ... 36 | 37 | ## License 38 | By contributing to Neural Attention Model for Abstractive Summarization, you agree that your contributions will be licensed 39 | under its BSD license. 40 | -------------------------------------------------------------------------------- /DUC/eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $1 4 | rm -fr $1/tmp_GOLD 5 | rm -fr $1/tmp_SYSTEM 6 | rm -fr $1/tmp_OUTPUT 7 | mkdir -p $1/tmp_GOLD 8 | mkdir -p $1/tmp_SYSTEM 9 | 10 | python $ABS/DUC/make_rouge.py --base $1 --gold tmp_GOLD --system tmp_SYSTEM --input input.txt 11 | perl $ABS/DUC/prepare4rouge-simple.pl tmp_SYSTEM tmp_GOLD tmp_OUTPUT 12 | 13 | cd tmp_OUTPUT 14 | export PERL5LIB=/data/users/sashar/summary/duc/RELEASE-1.5.5/ 15 | 16 | echo "FULL LENGTH" 17 | perl $ROUGE/ROUGE-1.5.5.pl -m -n 2 -w 1.2 -e $ROUGE -a settings.xml 18 | 19 | 20 | echo "LIMITED LENGTH" 21 | perl $ROUGE/ROUGE-1.5.5.pl -m -b 75 -n 2 -w 1.2 -e $ROUGE -a settings.xml 22 | -------------------------------------------------------------------------------- /DUC/make_DUC.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 
4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | """Construct the DUC test set. """ 14 | 15 | import sys 16 | import argparse 17 | import glob 18 | import re 19 | import nltk.data 20 | from nltk.tokenize.treebank import TreebankWordTokenizer 21 | #@lint-avoid-python-3-compatibility-imports 22 | 23 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 24 | tokenizer = TreebankWordTokenizer() 25 | def main(arguments): 26 | 27 | parser = argparse.ArgumentParser(description=__doc__, 28 | formatter_class= 29 | argparse.RawDescriptionHelpFormatter) 30 | parser.add_argument('--sum_docs', help="Article directory.", type=str) 31 | parser.add_argument('--year', help="DUC year to process.", type=str) 32 | parser.add_argument('--result_docs', help="Reference directory.", type=str) 33 | parser.add_argument('--ref_dir', 34 | help="Directory to output the references.", type=str) 35 | parser.add_argument('--sys_dir', 36 | help="Directory to output the references.", type=str) 37 | parser.add_argument('--article_file', 38 | help="File to output the article sentences..", type=str) 39 | args = parser.parse_args(arguments) 40 | 41 | refs = [open("{0}/task1_ref{1}.txt".format(args.ref_dir, i), "w") 42 | for i in range(4)] 43 | article = open(args.article_file, "w") 44 | prefix = open(args.sys_dir + "/task1_prefix.txt", "w") 45 | if args.year == "2003": 46 | files = glob.glob("{0}/*/*".format(args.sum_docs)) 47 | else: 48 | files = glob.glob("{0}/*/*".format(args.sum_docs)) 49 | files.sort() 50 | for f in files: 51 | docset = f.split("/")[-2][:-1].upper() 52 | name = f.split("/")[-1].upper() 53 | 54 | # Find references. 55 | if args.year == "2003": 56 | matches = list(glob.glob("{0}/{1}*.10.*{2}*".format( 57 | args.result_docs, docset, name))) 58 | else: 59 | matches = list(glob.glob("{0}/{1}*{2}*".format( 60 | args.result_docs, docset, name))) 61 | matches.sort() 62 | assert len(matches) == 4, matches 63 | for i, m in enumerate(matches): 64 | print >>refs[i], open(m).read().strip() 65 | 66 | # Make input. 67 | mode = 0 68 | text = "" 69 | for l in open(f): 70 | if l.strip() in ["
<P>", "</P>"]: 71 | continue 72 | if mode == 1 and l.strip() != "</TEXT>
": 73 | text += l.strip() + " " 74 | if l.strip() == "": 75 | mode = 1 76 | text = " ".join([w for w in text.split() if w[0] != "&"]) 77 | 78 | sents = sent_detector.tokenize(text) 79 | if len(sents) == 0: 80 | print >>article 81 | print >>prefix 82 | continue 83 | first = sents[0] 84 | 85 | # If the sentence is too short, add the second as well. 86 | if len(sents[0]) < 130 and len(sents) > 1: 87 | first = first.strip()[:-1] + " , " + sents[1] 88 | 89 | first = " ".join(tokenizer.tokenize(first.lower())) 90 | if ")" in first or ("_" in first and args.year == "2003"): 91 | first = re.split(" ((--)|-|_) ", first, 1)[-1] 92 | first = first.replace("(", "-lrb-") \ 93 | .replace(")", "-rrb-").replace("_", ",") 94 | print >>article, first 95 | print >>prefix, first[:75] 96 | if __name__ == '__main__': 97 | sys.exit(main(sys.argv[1:])) 98 | -------------------------------------------------------------------------------- /DUC/make_rouge.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | """Prep ROUGE eval. """ 14 | 15 | import sys 16 | import glob 17 | import os 18 | import argparse 19 | import itertools 20 | #@lint-avoid-python-3-compatibility-imports 21 | 22 | parser = argparse.ArgumentParser(description=__doc__, 23 | formatter_class= 24 | argparse.RawDescriptionHelpFormatter) 25 | parser.add_argument('--base', help="Base directory.", type=str) 26 | parser.add_argument('--gold', help="Base directory.", type=str) 27 | parser.add_argument('--system', help="Base directory.", type=str) 28 | parser.add_argument('--input', help="Input text.", type=str) 29 | 30 | args = parser.parse_args(sys.argv[1:]) 31 | 32 | for f in glob.glob("{0}/references/*".format(args.base)): 33 | task, ref = f.split("/")[-1].split("_") 34 | ref = int(ref.split(".")[0][-1]) 35 | 36 | for i, l in enumerate(open(f)): 37 | os.system("mkdir -p %s/%s%04d"%(args.gold, task, i)) 38 | with open("%s/%s%04d/%s%04d.%04d.gold" % (args.gold, task, i, task, i, ref), "w") as out: 39 | print >>out, l.strip() 40 | 41 | 42 | for f in glob.glob("{0}/system/*".format(args.base)): 43 | task, ref = f.split("/")[-1].split("_", 1) 44 | #if ref.startswith("ducsystem"): continue 45 | system = ref.split(".")[0] 46 | os.system("mkdir -p %s/%s"%(args.system, system)) 47 | for i, (l, input_line) in enumerate(itertools.izip(open(f), open(args.input))): 48 | words = [] 49 | numbers = dict([(len(w), w) for w in input_line.strip().split() if w[0].isdigit()]) 50 | for w in l.strip().split(): 51 | # Replace # with numbers from the input. 
52 | if w[0] == "#" and len(w) in numbers: 53 | words.append(numbers[len(w)]) 54 | elif w == "": 55 | continue 56 | else: 57 | words.append(w) 58 | 59 | with open("%s/%s/%s%04d.%s.system" % (args.system, system, task, i, system),"w") as out: 60 | if words: 61 | print >>out, " ".join(words) 62 | else: 63 | print >>out, "fail" 64 | -------------------------------------------------------------------------------- /DUC/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Step 1: Extracting DUC files" 4 | cd $1 5 | tar xvf DUC2003_Summarization_Documents.tgz 6 | tar xvf DUC2004_Summarization_Documents.tgz 7 | tar xvf duc2004_results.tgz 8 | tar xvf detagged.duc2003.abstracts.tar.gz 9 | 10 | cd duc2004_results/ROUGE/; tar xvf duc2004.task1.ROUGE.models.tar.gz 11 | cd $1 12 | cd DUC2003_Summarization_Documents/duc2003_testdata/task1/; tar xvf task1.docs.tar.gz 13 | 14 | 15 | echo "Step 2: Make reference files." 16 | cd $1 17 | mkdir $1/clean_2004/ 18 | mkdir $1/clean_2004/references 19 | mkdir $1/clean_2004/system 20 | python $ABS/DUC/make_DUC.py --result_docs duc2004_results/ROUGE/eval/models/1/ \ 21 | --sum_docs DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs/ \ 22 | --ref_dir clean_2004/references --year 2004 --article_file clean_2004/input.txt \ 23 | --sys_dir clean_2004/system 24 | 25 | mkdir $1/clean_2003/ 26 | mkdir $1/clean_2003/references 27 | mkdir $1/clean_2003/system 28 | python $ABS/DUC/make_DUC.py --result_docs detagged.duc2003.abstracts/models/ \ 29 | --sum_docs DUC2003_Summarization_Documents/duc2003_testdata/task1/docs.without.headlines/ \ 30 | --ref_dir clean_2003/references --year 2003 --article_file clean_2003/input.txt \ 31 | --sys_dir clean_2003/system 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For Neural Attention Model for Abstractive Summarization software 4 | 5 | Copyright (c) 2015-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the Neural Attention Model for Abstractive Summarization software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Attention-Based Summarization 2 | 3 | This project contains the Abs. neural abstractive summarization system from the paper 4 | 5 | A Neural Attention Model for Abstractive Summarization. 6 | Alexander M. Rush, Sumit Chopra, Jason Weston. 
7 | 8 | The release includes code for: 9 | 10 | * Extracting the summarization data set 11 | * Training the neural summarization model 12 | * Constructing evaluation sets with ROUGE 13 | * Tuning extractive features 14 | 15 | ## Setup 16 | 17 | To run the system, you will need to have [Torch7](http://torch.ch/)) 18 | and [fbcunn](https://github.com/facebook/fbcunn) (Facebook's deep 19 | learning library) installed. You will also need Python 2.7, NLTK, and 20 | GNU Parallel to run the data processing scripts. Additionally the 21 | code currently requires a CUDA GPU for training and decoding. 22 | 23 | Finally the scripts require that you set the $ABS environment variable. 24 | 25 | > export ABS=$PWD 26 | > export LUA_PATH="$LUA_PATH;$ABS/?.lua" 27 | 28 | ## Constructing the Data Set 29 | 30 | The model is trained to perform title generation from the first line 31 | of newspaper articles. Since the system is completely data-driven it 32 | requires a large set of aligned input-title pairs for training. 33 | 34 | To provide these pairs we use the [Annotated Gigaword 35 | corpus](https://catalog.ldc.upenn.edu/LDC2012T21) as our main data 36 | set. The corpus is available on LDC, but it requires membership. Once 37 | the annotated gigaword is obtained, you can simply run the provided 38 | script to extract the data set in text format. 39 | 40 | ### Generating the data 41 | 42 | To construct the data set run the following script to produce `working_dir/`, 43 | where `working_dir/' is the path to the directory where you want to store the 44 | processed data. The script 'construct_data.sh' makes use of the 'parallel' 45 | utility, so please make sure that it is in your path. 46 | WARNING: This may take a couple hours to run. 47 | 48 | > ./construct_data.sh agiga/ working_dir/ 49 | 50 | ### Format of the data files 51 | 52 | The above command builds aligned files of the form split.type.txt where split 53 | is train/valid/test and type is title/article. 54 | 55 | The output of the script is several aligned plain-text files. 56 | Each has one title or article per line. 57 | 58 | > head train.title.txt 59 | australian current account deficit narrows sharply 60 | at least two dead in southern philippines blast 61 | australian stocks close down #.# percent 62 | envoy urges north korea to restart nuclear disablement 63 | skorea announces tax cuts to stimulate economy 64 | 65 | These files can be used to train the ABS system or be used by other baseline models. 66 | 67 | ## Training the Model 68 | 69 | Once the data set has been constructed, we provide a simple script to train 70 | the model. 71 | 72 | > ./train_model.sh working_dir/ model.th 73 | 74 | 75 | The training process consists of two stages. First we convert the text 76 | files into generic input-title matrices and then we train a 77 | conditional NNLM on this representation. 78 | 79 | Once the model has been fully trained (this may require 3-4 days), 80 | you can use the test script to produce summaries of any plain text file.w 81 | 82 | > ./test_model.sh working_dir/valid.article.filter.txt model.th length_of_summary 83 | 84 | 85 | ### Training options 86 | 87 | These scripts utilize the Torch code available in `$ABS/summary/` 88 | 89 | There are two main torch entry points. One for training the model 90 | from data matrices and the other for evaluating the model on plain-text. 91 | 92 | > th summary/train.lua -help 93 | 94 | Train a summarization model. 95 | 96 | -articleDir Directory containing article training matrices. 
[] 97 | -titleDir Directory containing title training matrices. [] 98 | -validArticleDir Directory containing article matricess for validation. [] 99 | -validTitleDir Directory containing title matrices for validation. [] 100 | -auxModel The encoder model to use. [bow] 101 | -bowDim Article embedding size. [50] 102 | -attenPool Attention model pooling size. [5] 103 | -hiddenUnits Conv net encoder hidden units. [1000] 104 | -kernelWidth Conv net encoder kernel width. [5] 105 | -epochs Number of epochs to train. [5] 106 | -miniBatchSize Size of training minibatch. [64] 107 | -printEvery How often to print during training. [1000] 108 | -modelFilename File for saving loading/model. [] 109 | -window Size of NNLM window. [5] 110 | -embeddingDim Size of NNLM embeddings. [50] 111 | -hiddenSize Size of NNLM hidden layer. [100] 112 | -learningRate SGD learning rate. [0.1] 113 | 114 | 115 | 116 | ### Testing options 117 | 118 | 119 | The run script is used for beam-search decoding with a trained 120 | model. See the paper for a description of the extractive 121 | features used at decoding time. 122 | 123 | > th summary/run.lua -help 124 | 125 | -blockRepeatWords Disallow generating a repeated word. [false] 126 | -allowUNK Allow generating . [false] 127 | -fixedLength Produce exactly -length words. [true] 128 | -lmWeight Weight for main model. [1] 129 | -beamSize Size of the beam. [100] 130 | -extractive Force fully extractive summary. [false] 131 | -lmWeight Feature weight for the neural model. [1] 132 | -unigramBonus Feature weight for unigram extraction. [0] 133 | -bigramBonus Feature weight for bigram extraction. [0] 134 | -trigramBonus Feature weight for trigram extraction. [0] 135 | -lengthBonus Feature weight for length. [0] 136 | -unorderBonus Feature weight for out-of-order extraction. [0] 137 | -modelFilename Model to test. [] 138 | -inputf Input article files. [] 139 | -nbest Write out the nbest list in ZMert format. [false] 140 | -length Maximum length of summary.. [5] 141 | 142 | 143 | 144 | ## Evaluation Data Sets 145 | 146 | We evaluate the ABS model using the shared task from the Document Understanding Conference (DUC). 147 | 148 | This release also includes code for interactive with the DUC shared 149 | task on headline generation. The scripts for processing and evaluating 150 | on this data set are in the DUC/ directory. 151 | 152 | The [DUC data set](http://duc.nist.gov/duc2004/tasks.html) is 153 | available online, unfortunately you must manually fill out a form to 154 | request the data from NIST. Send the request to 155 | [Angela Ellis](mailto:angela.ellis@nist.gov). 156 | 157 | ### Processing DUC 158 | 159 | After receiving credentials you should obtain a series of 160 | tar files containing the data used as part of this shared task. 161 | 162 | 1. Make a directory DUC_data/ which should contain the given files 163 | 164 | 165 | >DUC2003\_Summarization\_Documents.tgz 166 | >DUC2004\_Summarization\_Documents.tgz 167 | >duc2004\_results.tgz 168 | >detagged.duc2003.abstracts.tar.gz 169 | 170 | 2. Run the setup script (this requires python and NLTK for tokenization) 171 | 172 | 173 | > ./DUC/setup.sh DUC_data/ 174 | 175 | 176 | After running the scripts there should be directories 177 | 178 | DUC_data/clean_2003/ 179 | DUC_data/clean_2004/ 180 | 181 | 182 | Each contains a file input.txt where each line is a tokenized first line of an article. 
183 | 184 | 185 | > head DUC_data/clean_2003/input.txt 186 | schizophrenia patients whose medication could n't stop the imaginary voices in their heads gained some relief after researchers repeatedly sent a magnetic field into a small area of their brains . 187 | scientists trying to fathom the mystery of schizophrenia say they have found the strongest evidence to date that the disabling psychiatric disorder is caused by gene abnormalities , according to a researcher at two state universities . 188 | a yale school of medicine study is expanding upon what scientists know about the link between schizophrenia and nicotine addiction . 189 | exploring chaos in a search for order , scientists who study the reality-shattering mental disease schizophrenia are becoming fascinated by the chemical environment of areas of the brain where perception is regulated . 190 | 191 | As well as a set of references: 192 | 193 | 194 | > head DUC_data/clean_2003/references/task1_ref0.txt 195 | Magnetic treatment may ease or lessen occurrence of schizophrenic voices. 196 | Evidence shows schizophrenia caused by gene abnormalities of Chromosome 1. 197 | Researchers examining evidence of link between schizophrenia and nicotine addiction. 198 | Scientists focusing on chemical environment of brain to understand schizophrenia. 199 | Schizophrenia study shows disparity between what's known and what's provided to patients. 200 | 201 | System output should be added to the directory system/task1_{name}.txt. For instance the script includes a baseline PREFIX system. 202 | 203 | 204 | DUC_data/clean_2003/references/task1_prefix.txt 205 | 206 | 207 | ### ROUGE for Eval 208 | 209 | To evaluate the summaries you will need the [ROUGE eval system](http://research.microsoft.com/~cyl/download/ROUGE-1.5.5.tgz). 210 | 211 | The ROUGE script requires output in a very complex HTML form. 212 | To simplify this process we include a script to convert the 213 | simple output to one that ROUGE can handle. 214 | 215 | Export the ROUGE directory `export ROUGE={path_to_rouge}` and then run the eval scripts 216 | 217 | 218 | > ./DUC/eval.sh DUC_data/clean_2003/ 219 | FULL LENGTH 220 | --------------------------------------------- 221 | prefix ROUGE-1 Average_R: 0.17831 (95%-conf.int. 0.16916 - 0.18736) 222 | prefix ROUGE-1 Average_P: 0.15445 (95%-conf.int. 0.14683 - 0.16220) 223 | prefix ROUGE-1 Average_F: 0.16482 (95%-conf.int. 0.15662 - 0.17318) 224 | --------------------------------------------- 225 | prefix ROUGE-2 Average_R: 0.04936 (95%-conf.int. 0.04420 - 0.05452) 226 | prefix ROUGE-2 Average_P: 0.04257 (95%-conf.int. 0.03794 - 0.04710) 227 | prefix ROUGE-2 Average_F: 0.04550 (95%-conf.int. 0.04060 - 0.05026) 228 | 229 | 230 | ## Tuning Feature Weights 231 | 232 | For our system ABS+ we additionally tune extractive features on the DUC 233 | summarization data. The final features we obtained our distributed with the 234 | system as `tuning/params.best.txt`. 235 | 236 | The MERT tuning code itself is located in the `tuning/` directory. Our setup 237 | uses [ZMert](http://cs.jhu.edu/~ozaidan/zmert/) for this process. 238 | 239 | It should be straightforward to tune the system on any developments 240 | summarization data. Take the following steps to run tuning on the 241 | DUC-2003 data set described above. 242 | 243 | First copy over reference files to the tuning directoy. 
For instance to tune on DUC-2003: 244 | 245 | ln -s DUC_data/clean_2003/references/task1_ref0.txt tuning/ref.0 246 | ln -s DUC_data/clean_2003/references/task1_ref1.txt tuning/ref.1 247 | ln -s DUC_data/clean_2003/references/task1_ref2.txt tuning/ref.2 248 | ln -s DUC_data/clean_2003/references/task1_ref3.txt tuning/ref.3 249 | 250 | Next copy the SDecoder template, `cp SDecoder_cmd.tpl SDecoder_cmd.py` 251 | and modify the `SDecoder_cmd.py` to point to the model and input text. 252 | 253 | {"model" : "model.th", 254 | "src" : "/data/users/sashar/DUC_data/clean_2003/input.txt", 255 | "title_len" : 14} 256 | 257 | 258 | Now you should be able to run Z-MERT and let it do its thing. 259 | 260 | > cd tuning/; java -cp zmert/lib/zmert.jar ZMERT ZMERT_cfg.txt 261 | 262 | When Z-MERT has finished you can run on new data using command: 263 | 264 | > python SDecoder_test.py input.txt model.th 265 | -------------------------------------------------------------------------------- /construct_data.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | export AGIGA=$1 4 | export WORK=$2 5 | export THREADS=30 6 | export SCRIPTS=$ABS/dataset 7 | export SPLITS=$ABS/dataset 8 | export UNK=5 9 | 10 | echo "Step 1: Construct the title-article pairs from gigaword" 11 | mkdir -p $WORK 12 | find $AGIGA/???/*.xml.gz | parallel --gnu --progress -j $THREADS python2.7 $SCRIPTS/process_agiga.py \{\} $WORK 13 | 14 | 15 | echo "Step 2: Compile the data into train/dev/test." 16 | cd $WORK 17 | cat $SPLITS/train.splits | xargs cat > train.data.txt 18 | cat $SPLITS/valid.splits | xargs cat > valid.data.txt 19 | cat $SPLITS/test.splits | xargs cat > test.data.txt 20 | 21 | 22 | echo "Step 3: Basic filtering on train/dev." 23 | python2.7 $SCRIPTS/filter.py train.data.txt > train.data.filter.txt 24 | python2.7 $SCRIPTS/filter.py valid.data.txt > valid.data.filter.txt 25 | 26 | 27 | echo "Step 4: Compile dictionary." 28 | python2.7 $SCRIPTS/make_dict.py $WORK/train.data.filter.txt $WORK/train $UNK 29 | 30 | 31 | echo "Step 5: Construct title-article files." 32 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/train.data.filter.txt > $WORK/train.title.txt 33 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/train.data.filter.txt > $WORK/train.article.txt 34 | 35 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/valid.data.txt > $WORK/valid.title.txt 36 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/valid.data.txt > $WORK/valid.article.txt 37 | 38 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/valid.data.filter.txt > $WORK/valid.title.filter.txt 39 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/valid.data.filter.txt > $WORK/valid.article.filter.txt 40 | 41 | python2.7 $SCRIPTS/pull.py trg_lc $WORK/train.title.dict < $WORK/test.data.txt > $WORK/test.title.txt 42 | python2.7 $SCRIPTS/pull.py src_lc $WORK/train.article.dict < $WORK/test.data.txt > $WORK/test.article.txt 43 | 44 | 45 | echo "Step 6: Constructing torch data files." 46 | bash $ABS/prep_torch_data.sh $WORK 47 | -------------------------------------------------------------------------------- /dataset/filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 
4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | import sys 14 | #@lint-avoid-python-3-compatibility-imports 15 | 16 | def get_words(parse): 17 | return [w.strip(")") 18 | for w in parse.split() 19 | if w[-1] == ')'] 20 | 21 | for l in open(sys.argv[1]): 22 | splits = l.strip().split("\t") 23 | if len(splits) != 4: 24 | continue 25 | title_parse, article_parse, title, article = splits 26 | title_words = title.split() 27 | article_words = article.split() 28 | 29 | # No blanks. 30 | if any((word == "" for word in title_words)): 31 | continue 32 | 33 | if any((word == "" for word in article_words)): 34 | continue 35 | 36 | if not any((word == "." for word in article_words)): 37 | continue 38 | 39 | # Spurious words to blacklist. 40 | # First set is words that never appear in input and output 41 | # Second set is punctuation and non-title words. 42 | bad_words = ['update#', 'update', 'recasts', 'undated', 'grafs', 'corrects', 43 | 'retransmitting', 'updates', 'dateline', 'writethru', 44 | 'recaps', 'inserts', 'incorporates', 'adv##', 45 | 'ld-writethru', 'djlfx', 'edits', 'byline', 46 | 'repetition', 'background', 'thruout', 'quotes', 47 | 'attention', 'ny###', 'overline', 'embargoed', 'ap', 'gmt', 48 | 'adds', 'embargo', 49 | 'urgent', '?', ' i ', ' : ', ' - ', ' by ', '-lrb-', '-rrb-'] 50 | if any((bad in title.lower() 51 | for bad in bad_words)): 52 | continue 53 | 54 | # Reasonable lengths 55 | if not (10 < len(article_words) < 100 and 56 | 3 < len(title_words) < 50): 57 | continue 58 | 59 | # Some word match. 60 | matches = len(set([w.lower() for w in title_words if len(w) > 3]) & 61 | set([w.lower() for w in article_words if len(w) > 3])) 62 | if matches < 1: 63 | continue 64 | 65 | # Okay, print. 66 | print(l.strip()) 67 | -------------------------------------------------------------------------------- /dataset/make_dict.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 
8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | import sys 14 | from collections import Counter 15 | #@lint-avoid-python-3-compatibility-imports 16 | 17 | title_words = Counter() 18 | article_words = Counter() 19 | limit = int(sys.argv[3]) 20 | 21 | for l in open(sys.argv[1]): 22 | splits = l.strip().split("\t") 23 | if len(splits) != 4: 24 | continue 25 | title_parse, article_parse, title, article = l.strip().split("\t") 26 | title_words.update(title.lower().split()) 27 | article_words.update(article.lower().split()) 28 | 29 | with open(sys.argv[2] + ".article.dict", "w") as f: 30 | print >>f, "", 1e5 31 | print >>f, "", 1e5 32 | print >>f, "", 1e5 33 | for word, count in article_words.most_common(): 34 | if count < limit: 35 | break 36 | print >>f, word, count 37 | 38 | with open(sys.argv[2] + ".title.dict", "w") as f: 39 | print >>f, "", 1e5 40 | print >>f, "", 1e5 41 | print >>f, "", 1e5 42 | for word, count in title_words.most_common(): 43 | if count < limit: 44 | break 45 | print >>f, word, count 46 | -------------------------------------------------------------------------------- /dataset/process_agiga.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | #/usr/bin/env python 14 | 15 | import sys 16 | import os 17 | import re 18 | import gzip 19 | #@lint-avoid-python-3-compatibility-imports 20 | 21 | # Make directory for output if it doesn't exist 22 | 23 | try: 24 | os.mkdir(sys.argv[2] + "/" + sys.argv[1].split("/")[-2]) 25 | except OSError: 26 | pass 27 | 28 | # Strip off .gz ending 29 | end = "/".join(sys.argv[1].split("/")[-2:])[:-len(".xml.gz")] + ".txt" 30 | 31 | out = open(sys.argv[2] + end, "w") 32 | 33 | # Parse and print titles and articles 34 | NONE, HEAD, NEXT, TEXT = 0, 1, 2, 3 35 | MODE = NONE 36 | title_parse = "" 37 | article_parse = [] 38 | 39 | # FIX: Some parses are mis-parenthesized. 40 | def fix_paren(parse): 41 | if len(parse) < 2: 42 | return parse 43 | if parse[0] == "(" and parse[1] == " ": 44 | return parse[2:-1] 45 | return parse 46 | 47 | def get_words(parse): 48 | words = [] 49 | for w in parse.split(): 50 | if w[-1] == ')': 51 | words.append(w.strip(")")) 52 | if words[-1] == ".": 53 | break 54 | return words 55 | 56 | def remove_digits(parse): 57 | return re.sub(r'\d', '#', parse) 58 | 59 | for l in gzip.open(sys.argv[1]): 60 | if MODE == HEAD: 61 | title_parse = remove_digits(fix_paren(l.strip())) 62 | MODE = NEXT 63 | 64 | if MODE == TEXT: 65 | article_parse.append(remove_digits(fix_paren(l.strip()))) 66 | 67 | if MODE == NONE and l.strip() == "": 68 | MODE = HEAD 69 | 70 | if MODE == NEXT and l.strip() == "
<TEXT>": 71 | MODE = TEXT 72 | 73 | if MODE == TEXT and l.strip() == "</TEXT>
": 74 | articles = [] 75 | # Annotated gigaword has a poor sentence segmenter. 76 | # Ensure there is a least a period. 77 | 78 | for i in range(len(article_parse)): 79 | articles.append(article_parse[i]) 80 | if "(. .)" in article_parse[i]: 81 | break 82 | 83 | article_parse = "(TOP " + " ".join(articles) + ")" 84 | 85 | # title_parse \t article_parse \t title \t article 86 | print >>out, "\t".join([title_parse, article_parse, 87 | " ".join(get_words(title_parse)), 88 | " ".join(get_words(article_parse))]) 89 | article_parse = [] 90 | MODE = NONE 91 | -------------------------------------------------------------------------------- /dataset/pull.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | """ 14 | Pull out elements of the title-article file. 15 | """ 16 | import sys 17 | #@lint-avoid-python-3-compatibility-imports 18 | 19 | words_dict = set([l.split()[0] 20 | for l in open(sys.argv[2])]) 21 | 22 | for l in sys.stdin: 23 | splits = l.strip().split("\t") 24 | if len(splits) != 4: 25 | continue 26 | title_parse, article_parse, title, article = l.strip().split("\t") 27 | if sys.argv[1] == "src": 28 | print(article) 29 | elif sys.argv[1] == "trg": 30 | print(title) 31 | elif sys.argv[1] == "src_lc": 32 | words = [w if w in words_dict else "" 33 | for w in article.lower().split()] 34 | print(" ".join(words)) 35 | elif sys.argv[1] == "trg_lc": 36 | t = title.lower() 37 | words = [w if w in words_dict else "" 38 | for w in t.split() 39 | if w not in ['"', "'", "''", "!", "=", "-", 40 | "--", ",", "?", ".", 41 | "``", "`", "-rrb-", "-llb-", "\\/"]] 42 | print(" ".join(words)) 43 | elif sys.argv[1] == "srctree": 44 | print(article_parse) 45 | elif sys.argv[1] == "interleave": 46 | # Format needed for T3 47 | print(article_parse) 48 | print(title_parse) 49 | -------------------------------------------------------------------------------- /dataset/small_train.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_201004.txt 2 | AFP/afp_eng_200212.txt 3 | AFP/afp_eng_200401.txt 4 | AFP/afp_eng_199508.txt 5 | AFP/afp_eng_200610.txt 6 | AFP/afp_eng_201007.txt 7 | APW/apw_eng_200105.txt 8 | APW/apw_eng_200408.txt 9 | APW/apw_eng_201001.txt 10 | APW/apw_eng_200906.txt 11 | APW/apw_eng_200606.txt 12 | APW/apw_eng_200211.txt 13 | APW/apw_eng_200512.txt 14 | APW/apw_eng_200505.txt 15 | CNA/cna_eng_199910.txt 16 | CNA/cna_eng_199905.txt 17 | CNA/cna_eng_200905.txt 18 | CNA/cna_eng_200101.txt 19 | CNA/cna_eng_200105.txt 20 | CNA/cna_eng_200201.txt 21 | LTW/ltw_eng_199806.txt 22 | LTW/ltw_eng_200702.txt 23 | LTW/ltw_eng_200607.txt 24 | LTW/ltw_eng_200708.txt 25 | LTW/ltw_eng_200501.txt 26 | NYT/nyt_eng_200807.txt 27 | NYT/nyt_eng_200612.txt 28 | NYT/nyt_eng_199608.txt 29 | NYT/nyt_eng_200106.txt 30 | NYT/nyt_eng_200311.txt 31 | NYT/nyt_eng_200702.txt 32 | NYT/nyt_eng_201007.txt 33 | NYT/nyt_eng_200212.txt 34 | XIN/xin_eng_199506.txt 35 | XIN/xin_eng_200311.txt 36 | XIN/xin_eng_199703.txt 37 | XIN/xin_eng_200305.txt 38 | XIN/xin_eng_199808.txt 39 | XIN/xin_eng_199609.txt 40 | XIN/xin_eng_200109.txt 41 | XIN/xin_eng_200706.txt 
42 | -------------------------------------------------------------------------------- /dataset/test.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_199511.txt 2 | AFP/afp_eng_200606.txt 3 | AFP/afp_eng_199703.txt 4 | AFP/afp_eng_200811.txt 5 | AFP/afp_eng_199604.txt 6 | AFP/afp_eng_200704.txt 7 | AFP/afp_eng_200701.txt 8 | APW/apw_eng_200412.txt 9 | APW/apw_eng_200908.txt 10 | APW/apw_eng_199605.txt 11 | APW/apw_eng_200305.txt 12 | APW/apw_eng_200506.txt 13 | APW/apw_eng_199608.txt 14 | APW/apw_eng_199808.txt 15 | APW/apw_eng_200708.txt 16 | APW/apw_eng_199707.txt 17 | CNA/cna_eng_200910.txt 18 | CNA/cna_eng_200103.txt 19 | CNA/cna_eng_200308.txt 20 | CNA/cna_eng_200904.txt 21 | CNA/cna_eng_201012.txt 22 | CNA/cna_eng_201007.txt 23 | CNA/cna_eng_200112.txt 24 | LTW/ltw_eng_200605.txt 25 | LTW/ltw_eng_200608.txt 26 | LTW/ltw_eng_200312.txt 27 | LTW/ltw_eng_200906.txt 28 | LTW/ltw_eng_200606.txt 29 | LTW/ltw_eng_200805.txt 30 | NYT/nyt_eng_201005.txt 31 | NYT/nyt_eng_200305.txt 32 | NYT/nyt_eng_200505.txt 33 | NYT/nyt_eng_199612.txt 34 | NYT/nyt_eng_199910.txt 35 | NYT/nyt_eng_199809.txt 36 | NYT/nyt_eng_201004.txt 37 | NYT/nyt_eng_200808.txt 38 | NYT/nyt_eng_200601.txt 39 | XIN/xin_eng_199704.txt 40 | XIN/xin_eng_200310.txt 41 | XIN/xin_eng_200711.txt 42 | XIN/xin_eng_200804.txt 43 | XIN/xin_eng_200902.txt 44 | XIN/xin_eng_200106.txt 45 | XIN/xin_eng_199802.txt 46 | XIN/xin_eng_200411.txt 47 | XIN/xin_eng_200511.txt 48 | -------------------------------------------------------------------------------- /dataset/train.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_200809.txt 2 | AFP/afp_eng_199412.txt 3 | AFP/afp_eng_200311.txt 4 | AFP/afp_eng_199512.txt 5 | AFP/afp_eng_200203.txt 6 | AFP/afp_eng_200204.txt 7 | AFP/afp_eng_200608.txt 8 | AFP/afp_eng_200509.txt 9 | AFP/afp_eng_200410.txt 10 | AFP/afp_eng_200405.txt 11 | AFP/afp_eng_200211.txt 12 | AFP/afp_eng_200205.txt 13 | AFP/afp_eng_199405.txt 14 | AFP/afp_eng_199510.txt 15 | AFP/afp_eng_199611.txt 16 | AFP/afp_eng_199612.txt 17 | AFP/afp_eng_200907.txt 18 | AFP/afp_eng_200412.txt 19 | AFP/afp_eng_201002.txt 20 | AFP/afp_eng_200910.txt 21 | AFP/afp_eng_199504.txt 22 | AFP/afp_eng_200207.txt 23 | AFP/afp_eng_199501.txt 24 | AFP/afp_eng_200812.txt 25 | AFP/afp_eng_200307.txt 26 | AFP/afp_eng_199608.txt 27 | AFP/afp_eng_200303.txt 28 | AFP/afp_eng_200304.txt 29 | AFP/afp_eng_199409.txt 30 | AFP/afp_eng_200202.txt 31 | AFP/afp_eng_199610.txt 32 | AFP/afp_eng_199503.txt 33 | AFP/afp_eng_200904.txt 34 | AFP/afp_eng_200212.txt 35 | AFP/afp_eng_201010.txt 36 | AFP/afp_eng_200901.txt 37 | AFP/afp_eng_200702.txt 38 | AFP/afp_eng_199609.txt 39 | AFP/afp_eng_200806.txt 40 | AFP/afp_eng_200805.txt 41 | AFP/afp_eng_200408.txt 42 | AFP/afp_eng_200611.txt 43 | AFP/afp_eng_201012.txt 44 | AFP/afp_eng_200501.txt 45 | AFP/afp_eng_200706.txt 46 | AFP/afp_eng_200505.txt 47 | AFP/afp_eng_199602.txt 48 | AFP/afp_eng_199601.txt 49 | AFP/afp_eng_200607.txt 50 | AFP/afp_eng_200404.txt 51 | AFP/afp_eng_200406.txt 52 | AFP/afp_eng_200912.txt 53 | AFP/afp_eng_200306.txt 54 | AFP/afp_eng_200312.txt 55 | AFP/afp_eng_199506.txt 56 | AFP/afp_eng_199701.txt 57 | AFP/afp_eng_199505.txt 58 | AFP/afp_eng_199606.txt 59 | AFP/afp_eng_200512.txt 60 | AFP/afp_eng_200711.txt 61 | AFP/afp_eng_200603.txt 62 | AFP/afp_eng_200504.txt 63 | AFP/afp_eng_200310.txt 64 | AFP/afp_eng_200209.txt 65 | AFP/afp_eng_199411.txt 66 | AFP/afp_eng_199509.txt 67 | AFP/afp_eng_200903.txt 
68 | AFP/afp_eng_200707.txt 69 | AFP/afp_eng_200705.txt 70 | AFP/afp_eng_199603.txt 71 | AFP/afp_eng_200112.txt 72 | AFP/afp_eng_200502.txt 73 | AFP/afp_eng_200508.txt 74 | AFP/afp_eng_200403.txt 75 | AFP/afp_eng_199705.txt 76 | AFP/afp_eng_200908.txt 77 | AFP/afp_eng_200206.txt 78 | AFP/afp_eng_200906.txt 79 | AFP/afp_eng_199507.txt 80 | AFP/afp_eng_201001.txt 81 | AFP/afp_eng_199407.txt 82 | AFP/afp_eng_201004.txt 83 | AFP/afp_eng_200208.txt 84 | AFP/afp_eng_200902.txt 85 | AFP/afp_eng_200710.txt 86 | AFP/afp_eng_200503.txt 87 | AFP/afp_eng_200905.txt 88 | AFP/afp_eng_200712.txt 89 | AFP/afp_eng_200402.txt 90 | AFP/afp_eng_200807.txt 91 | AFP/afp_eng_200804.txt 92 | AFP/afp_eng_201006.txt 93 | AFP/afp_eng_200511.txt 94 | AFP/afp_eng_200802.txt 95 | AFP/afp_eng_201008.txt 96 | AFP/afp_eng_200309.txt 97 | AFP/afp_eng_200301.txt 98 | AFP/afp_eng_200612.txt 99 | AFP/afp_eng_199704.txt 100 | AFP/afp_eng_200604.txt 101 | AFP/afp_eng_199410.txt 102 | AFP/afp_eng_200911.txt 103 | AFP/afp_eng_200510.txt 104 | AFP/afp_eng_200803.txt 105 | AFP/afp_eng_201009.txt 106 | AFP/afp_eng_200810.txt 107 | AFP/afp_eng_200610.txt 108 | AFP/afp_eng_200507.txt 109 | AFP/afp_eng_200708.txt 110 | AFP/afp_eng_200201.txt 111 | AFP/afp_eng_200801.txt 112 | AFP/afp_eng_200407.txt 113 | AFP/afp_eng_200305.txt 114 | AFP/afp_eng_199408.txt 115 | AFP/afp_eng_200210.txt 116 | AFP/afp_eng_199607.txt 117 | AFP/afp_eng_201003.txt 118 | AFP/afp_eng_200605.txt 119 | AFP/afp_eng_201011.txt 120 | AFP/afp_eng_201007.txt 121 | AFP/afp_eng_200401.txt 122 | AFP/afp_eng_200602.txt 123 | AFP/afp_eng_201005.txt 124 | AFP/afp_eng_200709.txt 125 | AFP/afp_eng_200302.txt 126 | AFP/afp_eng_200909.txt 127 | AFP/afp_eng_200609.txt 128 | AFP/afp_eng_200808.txt 129 | AFP/afp_eng_200411.txt 130 | AFP/afp_eng_199508.txt 131 | AFP/afp_eng_199605.txt 132 | AFP/afp_eng_200409.txt 133 | APW/apw_eng_201001.txt 134 | APW/apw_eng_199501.txt 135 | APW/apw_eng_200307.txt 136 | APW/apw_eng_200902.txt 137 | APW/apw_eng_200303.txt 138 | APW/apw_eng_200304.txt 139 | APW/apw_eng_200503.txt 140 | APW/apw_eng_200905.txt 141 | APW/apw_eng_200111.txt 142 | APW/apw_eng_200301.txt 143 | APW/apw_eng_199712.txt 144 | APW/apw_eng_199612.txt 145 | APW/apw_eng_200011.txt 146 | APW/apw_eng_199503.txt 147 | APW/apw_eng_200106.txt 148 | APW/apw_eng_200802.txt 149 | APW/apw_eng_200007.txt 150 | APW/apw_eng_199905.txt 151 | APW/apw_eng_201009.txt 152 | APW/apw_eng_200109.txt 153 | APW/apw_eng_200612.txt 154 | APW/apw_eng_200702.txt 155 | APW/apw_eng_199609.txt 156 | APW/apw_eng_199909.txt 157 | APW/apw_eng_199702.txt 158 | APW/apw_eng_200805.txt 159 | APW/apw_eng_199902.txt 160 | APW/apw_eng_201011.txt 161 | APW/apw_eng_200107.txt 162 | APW/apw_eng_200611.txt 163 | APW/apw_eng_200904.txt 164 | APW/apw_eng_200006.txt 165 | APW/apw_eng_200505.txt 166 | APW/apw_eng_200810.txt 167 | APW/apw_eng_199801.txt 168 | APW/apw_eng_200808.txt 169 | APW/apw_eng_200607.txt 170 | APW/apw_eng_200404.txt 171 | APW/apw_eng_199803.txt 172 | APW/apw_eng_199611.txt 173 | APW/apw_eng_200406.txt 174 | APW/apw_eng_200211.txt 175 | APW/apw_eng_199911.txt 176 | APW/apw_eng_200912.txt 177 | APW/apw_eng_200809.txt 178 | APW/apw_eng_199710.txt 179 | APW/apw_eng_199907.txt 180 | APW/apw_eng_199607.txt 181 | APW/apw_eng_199506.txt 182 | APW/apw_eng_200605.txt 183 | APW/apw_eng_199502.txt 184 | APW/apw_eng_199505.txt 185 | APW/apw_eng_200811.txt 186 | APW/apw_eng_200401.txt 187 | APW/apw_eng_200602.txt 188 | APW/apw_eng_200512.txt 189 | APW/apw_eng_200711.txt 190 | APW/apw_eng_200909.txt 191 | 
APW/apw_eng_200201.txt 192 | APW/apw_eng_200202.txt 193 | APW/apw_eng_200103.txt 194 | APW/apw_eng_199604.txt 195 | APW/apw_eng_199508.txt 196 | APW/apw_eng_199711.txt 197 | APW/apw_eng_200310.txt 198 | APW/apw_eng_200209.txt 199 | APW/apw_eng_199809.txt 200 | APW/apw_eng_199411.txt 201 | APW/apw_eng_200003.txt 202 | APW/apw_eng_200903.txt 203 | APW/apw_eng_199903.txt 204 | APW/apw_eng_199512.txt 205 | APW/apw_eng_200104.txt 206 | APW/apw_eng_201006.txt 207 | APW/apw_eng_200005.txt 208 | APW/apw_eng_200405.txt 209 | APW/apw_eng_199906.txt 210 | APW/apw_eng_199904.txt 211 | APW/apw_eng_199510.txt 212 | APW/apw_eng_200112.txt 213 | APW/apw_eng_200508.txt 214 | APW/apw_eng_200108.txt 215 | APW/apw_eng_200403.txt 216 | APW/apw_eng_201010.txt 217 | APW/apw_eng_200906.txt 218 | APW/apw_eng_201002.txt 219 | APW/apw_eng_200910.txt 220 | APW/apw_eng_199806.txt 221 | APW/apw_eng_200806.txt 222 | APW/apw_eng_199504.txt 223 | APW/apw_eng_200207.txt 224 | APW/apw_eng_201004.txt 225 | APW/apw_eng_200208.txt 226 | APW/apw_eng_199709.txt 227 | APW/apw_eng_200812.txt 228 | APW/apw_eng_200710.txt 229 | APW/apw_eng_200410.txt 230 | APW/apw_eng_200712.txt 231 | APW/apw_eng_200001.txt 232 | APW/apw_eng_201012.txt 233 | APW/apw_eng_200402.txt 234 | APW/apw_eng_200804.txt 235 | APW/apw_eng_199610.txt 236 | APW/apw_eng_200009.txt 237 | APW/apw_eng_200511.txt 238 | APW/apw_eng_199602.txt 239 | APW/apw_eng_199601.txt 240 | APW/apw_eng_200901.txt 241 | APW/apw_eng_199704.txt 242 | APW/apw_eng_200308.txt 243 | APW/apw_eng_200604.txt 244 | APW/apw_eng_200701.txt 245 | APW/apw_eng_200704.txt 246 | APW/apw_eng_199603.txt 247 | APW/apw_eng_200408.txt 248 | APW/apw_eng_200911.txt 249 | APW/apw_eng_199511.txt 250 | APW/apw_eng_200510.txt 251 | APW/apw_eng_200803.txt 252 | APW/apw_eng_199802.txt 253 | APW/apw_eng_200501.txt 254 | APW/apw_eng_200706.txt 255 | APW/apw_eng_200610.txt 256 | APW/apw_eng_199804.txt 257 | APW/apw_eng_200507.txt 258 | APW/apw_eng_200801.txt 259 | APW/apw_eng_199908.txt 260 | APW/apw_eng_201007.txt 261 | APW/apw_eng_200601.txt 262 | APW/apw_eng_200306.txt 263 | APW/apw_eng_200407.txt 264 | APW/apw_eng_200212.txt 265 | APW/apw_eng_199910.txt 266 | APW/apw_eng_200004.txt 267 | APW/apw_eng_200312.txt 268 | APW/apw_eng_201003.txt 269 | APW/apw_eng_199701.txt 270 | APW/apw_eng_200008.txt 271 | APW/apw_eng_200012.txt 272 | APW/apw_eng_201005.txt 273 | APW/apw_eng_200709.txt 274 | APW/apw_eng_200105.txt 275 | APW/apw_eng_200302.txt 276 | APW/apw_eng_200101.txt 277 | APW/apw_eng_200609.txt 278 | APW/apw_eng_200603.txt 279 | APW/apw_eng_199901.txt 280 | APW/apw_eng_200002.txt 281 | APW/apw_eng_200504.txt 282 | APW/apw_eng_200606.txt 283 | APW/apw_eng_200409.txt 284 | APW/apw_eng_199509.txt 285 | APW/apw_eng_199412.txt 286 | APW/apw_eng_200311.txt 287 | APW/apw_eng_200203.txt 288 | APW/apw_eng_200703.txt 289 | APW/apw_eng_200707.txt 290 | APW/apw_eng_200509.txt 291 | APW/apw_eng_200102.txt 292 | APW/apw_eng_200705.txt 293 | APW/apw_eng_201008.txt 294 | APW/apw_eng_200807.txt 295 | APW/apw_eng_200502.txt 296 | APW/apw_eng_200110.txt 297 | APW/apw_eng_200010.txt 298 | APW/apw_eng_199705.txt 299 | APW/apw_eng_199706.txt 300 | APW/apw_eng_200206.txt 301 | APW/apw_eng_199703.txt 302 | APW/apw_eng_199805.txt 303 | APW/apw_eng_200411.txt 304 | APW/apw_eng_199507.txt 305 | CNA/cna_eng_200608.txt 306 | CNA/cna_eng_200906.txt 307 | CNA/cna_eng_200110.txt 308 | CNA/cna_eng_199712.txt 309 | CNA/cna_eng_200609.txt 310 | CNA/cna_eng_199903.txt 311 | CNA/cna_eng_200111.txt 312 | CNA/cna_eng_200712.txt 313 | 
CNA/cna_eng_200808.txt 314 | CNA/cna_eng_200006.txt 315 | CNA/cna_eng_199803.txt 316 | CNA/cna_eng_200811.txt 317 | CNA/cna_eng_200004.txt 318 | CNA/cna_eng_199906.txt 319 | CNA/cna_eng_200009.txt 320 | CNA/cna_eng_200401.txt 321 | CNA/cna_eng_200602.txt 322 | CNA/cna_eng_200802.txt 323 | CNA/cna_eng_200108.txt 324 | CNA/cna_eng_200501.txt 325 | CNA/cna_eng_200106.txt 326 | CNA/cna_eng_200203.txt 327 | CNA/cna_eng_200903.txt 328 | CNA/cna_eng_200812.txt 329 | CNA/cna_eng_200911.txt 330 | CNA/cna_eng_200505.txt 331 | CNA/cna_eng_199710.txt 332 | CNA/cna_eng_200806.txt 333 | CNA/cna_eng_200311.txt 334 | CNA/cna_eng_200507.txt 335 | CNA/cna_eng_200809.txt 336 | CNA/cna_eng_200010.txt 337 | CNA/cna_eng_200312.txt 338 | CNA/cna_eng_199802.txt 339 | CNA/cna_eng_200807.txt 340 | CNA/cna_eng_199908.txt 341 | CNA/cna_eng_200202.txt 342 | CNA/cna_eng_201002.txt 343 | CNA/cna_eng_200512.txt 344 | CNA/cna_eng_200309.txt 345 | CNA/cna_eng_200607.txt 346 | CNA/cna_eng_199711.txt 347 | CNA/cna_eng_199809.txt 348 | CNA/cna_eng_200805.txt 349 | CNA/cna_eng_200610.txt 350 | CNA/cna_eng_200109.txt 351 | CNA/cna_eng_200007.txt 352 | CNA/cna_eng_200703.txt 353 | CNA/cna_eng_200201.txt 354 | CNA/cna_eng_199904.txt 355 | CNA/cna_eng_199806.txt 356 | CNA/cna_eng_200410.txt 357 | CNA/cna_eng_200001.txt 358 | CNA/cna_eng_200709.txt 359 | CNA/cna_eng_200408.txt 360 | CNA/cna_eng_200711.txt 361 | CNA/cna_eng_200101.txt 362 | CNA/cna_eng_201003.txt 363 | CNA/cna_eng_199805.txt 364 | CNA/cna_eng_200012.txt 365 | CNA/cna_eng_199804.txt 366 | CNA/cna_eng_200907.txt 367 | CNA/cna_eng_200502.txt 368 | CNA/cna_eng_200603.txt 369 | CNA/cna_eng_199911.txt 370 | CNA/cna_eng_200902.txt 371 | CNA/cna_eng_200605.txt 372 | CNA/cna_eng_200107.txt 373 | CNA/cna_eng_200611.txt 374 | CNA/cna_eng_201008.txt 375 | CNA/cna_eng_200409.txt 376 | CNA/cna_eng_200412.txt 377 | CNA/cna_eng_200503.txt 378 | CNA/cna_eng_200005.txt 379 | CNA/cna_eng_200905.txt 380 | CNA/cna_eng_200105.txt 381 | CNA/cna_eng_199905.txt 382 | CNA/cna_eng_200511.txt 383 | CNA/cna_eng_199902.txt 384 | CNA/cna_eng_200704.txt 385 | CNA/cna_eng_200901.txt 386 | CNA/cna_eng_199808.txt 387 | CNA/cna_eng_201009.txt 388 | CNA/cna_eng_200810.txt 389 | CNA/cna_eng_201011.txt 390 | CNA/cna_eng_200708.txt 391 | CNA/cna_eng_200402.txt 392 | CNA/cna_eng_200604.txt 393 | CNA/cna_eng_201006.txt 394 | CNA/cna_eng_200008.txt 395 | CNA/cna_eng_201001.txt 396 | CNA/cna_eng_200509.txt 397 | CNA/cna_eng_200510.txt 398 | CNA/cna_eng_200405.txt 399 | CNA/cna_eng_200801.txt 400 | CNA/cna_eng_199912.txt 401 | CNA/cna_eng_200104.txt 402 | CNA/cna_eng_200307.txt 403 | CNA/cna_eng_201010.txt 404 | CNA/cna_eng_200506.txt 405 | CNA/cna_eng_200612.txt 406 | CNA/cna_eng_200706.txt 407 | CNA/cna_eng_200701.txt 408 | CNA/cna_eng_200804.txt 409 | CNA/cna_eng_199709.txt 410 | CNA/cna_eng_200411.txt 411 | CNA/cna_eng_199901.txt 412 | CNA/cna_eng_200002.txt 413 | CNA/cna_eng_200508.txt 414 | CNA/cna_eng_200310.txt 415 | CNA/cna_eng_200908.txt 416 | CNA/cna_eng_199907.txt 417 | CNA/cna_eng_200606.txt 418 | CNA/cna_eng_200601.txt 419 | CNA/cna_eng_200702.txt 420 | CNA/cna_eng_200909.txt 421 | CNA/cna_eng_199807.txt 422 | CNA/cna_eng_199909.txt 423 | CNA/cna_eng_200404.txt 424 | CNA/cna_eng_200403.txt 425 | CNA/cna_eng_200406.txt 426 | CNA/cna_eng_200707.txt 427 | CNA/cna_eng_199910.txt 428 | CNA/cna_eng_200705.txt 429 | CNA/cna_eng_200011.txt 430 | CNA/cna_eng_201004.txt 431 | CNA/cna_eng_199801.txt 432 | LTW/ltw_eng_200405.txt 433 | LTW/ltw_eng_199710.txt 434 | LTW/ltw_eng_200311.txt 435 | 
LTW/ltw_eng_200507.txt 436 | LTW/ltw_eng_200809.txt 437 | LTW/ltw_eng_199801.txt 438 | LTW/ltw_eng_199406.txt 439 | LTW/ltw_eng_200506.txt 440 | LTW/ltw_eng_199704.txt 441 | LTW/ltw_eng_199508.txt 442 | LTW/ltw_eng_200409.txt 443 | LTW/ltw_eng_200412.txt 444 | LTW/ltw_eng_200710.txt 445 | LTW/ltw_eng_200904.txt 446 | LTW/ltw_eng_199603.txt 447 | LTW/ltw_eng_199512.txt 448 | LTW/ltw_eng_200411.txt 449 | LTW/ltw_eng_200603.txt 450 | LTW/ltw_eng_200810.txt 451 | LTW/ltw_eng_200401.txt 452 | LTW/ltw_eng_200410.txt 453 | LTW/ltw_eng_199411.txt 454 | LTW/ltw_eng_200404.txt 455 | LTW/ltw_eng_199705.txt 456 | LTW/ltw_eng_200510.txt 457 | LTW/ltw_eng_199804.txt 458 | LTW/ltw_eng_200705.txt 459 | LTW/ltw_eng_200812.txt 460 | LTW/ltw_eng_200911.txt 461 | LTW/ltw_eng_200502.txt 462 | LTW/ltw_eng_199501.txt 463 | LTW/ltw_eng_199506.txt 464 | LTW/ltw_eng_200611.txt 465 | LTW/ltw_eng_200804.txt 466 | LTW/ltw_eng_199701.txt 467 | LTW/ltw_eng_199711.txt 468 | LTW/ltw_eng_199601.txt 469 | LTW/ltw_eng_199606.txt 470 | LTW/ltw_eng_200704.txt 471 | LTW/ltw_eng_199702.txt 472 | LTW/ltw_eng_200703.txt 473 | LTW/ltw_eng_200308.txt 474 | LTW/ltw_eng_200602.txt 475 | LTW/ltw_eng_199703.txt 476 | LTW/ltw_eng_200708.txt 477 | LTW/ltw_eng_200604.txt 478 | LTW/ltw_eng_200711.txt 479 | LTW/ltw_eng_200909.txt 480 | LTW/ltw_eng_200509.txt 481 | LTW/ltw_eng_200406.txt 482 | LTW/ltw_eng_199612.txt 483 | LTW/ltw_eng_199608.txt 484 | LTW/ltw_eng_200505.txt 485 | LTW/ltw_eng_200912.txt 486 | LTW/ltw_eng_199412.txt 487 | LTW/ltw_eng_200709.txt 488 | LTW/ltw_eng_200910.txt 489 | LTW/ltw_eng_200612.txt 490 | LTW/ltw_eng_199405.txt 491 | LTW/ltw_eng_199510.txt 492 | LTW/ltw_eng_199407.txt 493 | LTW/ltw_eng_200803.txt 494 | LTW/ltw_eng_200607.txt 495 | LTW/ltw_eng_199712.txt 496 | LTW/ltw_eng_199611.txt 497 | LTW/ltw_eng_200609.txt 498 | LTW/ltw_eng_200503.txt 499 | LTW/ltw_eng_199605.txt 500 | LTW/ltw_eng_199709.txt 501 | LTW/ltw_eng_200808.txt 502 | LTW/ltw_eng_200907.txt 503 | LTW/ltw_eng_200902.txt 504 | LTW/ltw_eng_199707.txt 505 | LTW/ltw_eng_200811.txt 506 | LTW/ltw_eng_199409.txt 507 | LTW/ltw_eng_199410.txt 508 | LTW/ltw_eng_200908.txt 509 | LTW/ltw_eng_199609.txt 510 | LTW/ltw_eng_199408.txt 511 | LTW/ltw_eng_200601.txt 512 | LTW/ltw_eng_200402.txt 513 | LTW/ltw_eng_200501.txt 514 | LTW/ltw_eng_199504.txt 515 | LTW/ltw_eng_199805.txt 516 | LTW/ltw_eng_199511.txt 517 | LTW/ltw_eng_199505.txt 518 | LTW/ltw_eng_199610.txt 519 | LTW/ltw_eng_200801.txt 520 | LTW/ltw_eng_200806.txt 521 | LTW/ltw_eng_199802.txt 522 | LTW/ltw_eng_200807.txt 523 | LTW/ltw_eng_199507.txt 524 | LTW/ltw_eng_200309.txt 525 | LTW/ltw_eng_200706.txt 526 | LTW/ltw_eng_200701.txt 527 | LTW/ltw_eng_199708.txt 528 | LTW/ltw_eng_199502.txt 529 | LTW/ltw_eng_200712.txt 530 | LTW/ltw_eng_200511.txt 531 | LTW/ltw_eng_200610.txt 532 | LTW/ltw_eng_200905.txt 533 | LTW/ltw_eng_200901.txt 534 | LTW/ltw_eng_200903.txt 535 | LTW/ltw_eng_199806.txt 536 | LTW/ltw_eng_200508.txt 537 | LTW/ltw_eng_200802.txt 538 | LTW/ltw_eng_200702.txt 539 | LTW/ltw_eng_200408.txt 540 | LTW/ltw_eng_199604.txt 541 | LTW/ltw_eng_200403.txt 542 | LTW/ltw_eng_199607.txt 543 | LTW/ltw_eng_199602.txt 544 | LTW/ltw_eng_200504.txt 545 | LTW/ltw_eng_200707.txt 546 | LTW/ltw_eng_199706.txt 547 | NYT/nyt_eng_200110.txt 548 | NYT/nyt_eng_200904.txt 549 | NYT/nyt_eng_200903.txt 550 | NYT/nyt_eng_200707.txt 551 | NYT/nyt_eng_199505.txt 552 | NYT/nyt_eng_200703.txt 553 | NYT/nyt_eng_200704.txt 554 | NYT/nyt_eng_200103.txt 555 | NYT/nyt_eng_199701.txt 556 | NYT/nyt_eng_199502.txt 557 | 
NYT/nyt_eng_200511.txt 558 | NYT/nyt_eng_200701.txt 559 | NYT/nyt_eng_200602.txt 560 | NYT/nyt_eng_200902.txt 561 | NYT/nyt_eng_200411.txt 562 | NYT/nyt_eng_199411.txt 563 | NYT/nyt_eng_200506.txt 564 | NYT/nyt_eng_201007.txt 565 | NYT/nyt_eng_199711.txt 566 | NYT/nyt_eng_200407.txt 567 | NYT/nyt_eng_200612.txt 568 | NYT/nyt_eng_200709.txt 569 | NYT/nyt_eng_199806.txt 570 | NYT/nyt_eng_201009.txt 571 | NYT/nyt_eng_200509.txt 572 | NYT/nyt_eng_200212.txt 573 | NYT/nyt_eng_200302.txt 574 | NYT/nyt_eng_200909.txt 575 | NYT/nyt_eng_200804.txt 576 | NYT/nyt_eng_200803.txt 577 | NYT/nyt_eng_200812.txt 578 | NYT/nyt_eng_200507.txt 579 | NYT/nyt_eng_200211.txt 580 | NYT/nyt_eng_199705.txt 581 | NYT/nyt_eng_200905.txt 582 | NYT/nyt_eng_200911.txt 583 | NYT/nyt_eng_200907.txt 584 | NYT/nyt_eng_200105.txt 585 | NYT/nyt_eng_199608.txt 586 | NYT/nyt_eng_199808.txt 587 | NYT/nyt_eng_200207.txt 588 | NYT/nyt_eng_200004.txt 589 | NYT/nyt_eng_199703.txt 590 | NYT/nyt_eng_200006.txt 591 | NYT/nyt_eng_199905.txt 592 | NYT/nyt_eng_201006.txt 593 | NYT/nyt_eng_199802.txt 594 | NYT/nyt_eng_199903.txt 595 | NYT/nyt_eng_200705.txt 596 | NYT/nyt_eng_201012.txt 597 | NYT/nyt_eng_200610.txt 598 | NYT/nyt_eng_199801.txt 599 | NYT/nyt_eng_199410.txt 600 | NYT/nyt_eng_200001.txt 601 | NYT/nyt_eng_200202.txt 602 | NYT/nyt_eng_199412.txt 603 | NYT/nyt_eng_199702.txt 604 | NYT/nyt_eng_200112.txt 605 | NYT/nyt_eng_200311.txt 606 | NYT/nyt_eng_199611.txt 607 | NYT/nyt_eng_199912.txt 608 | NYT/nyt_eng_200011.txt 609 | NYT/nyt_eng_200002.txt 610 | NYT/nyt_eng_200710.txt 611 | NYT/nyt_eng_200609.txt 612 | NYT/nyt_eng_201002.txt 613 | NYT/nyt_eng_200403.txt 614 | NYT/nyt_eng_199504.txt 615 | NYT/nyt_eng_200809.txt 616 | NYT/nyt_eng_200504.txt 617 | NYT/nyt_eng_199708.txt 618 | NYT/nyt_eng_201001.txt 619 | NYT/nyt_eng_199610.txt 620 | NYT/nyt_eng_200405.txt 621 | NYT/nyt_eng_200005.txt 622 | NYT/nyt_eng_200611.txt 623 | NYT/nyt_eng_200605.txt 624 | NYT/nyt_eng_199907.txt 625 | NYT/nyt_eng_199601.txt 626 | NYT/nyt_eng_200512.txt 627 | NYT/nyt_eng_199510.txt 628 | NYT/nyt_eng_199901.txt 629 | NYT/nyt_eng_199607.txt 630 | NYT/nyt_eng_200508.txt 631 | NYT/nyt_eng_200908.txt 632 | NYT/nyt_eng_200810.txt 633 | NYT/nyt_eng_199902.txt 634 | NYT/nyt_eng_199501.txt 635 | NYT/nyt_eng_199707.txt 636 | NYT/nyt_eng_200607.txt 637 | NYT/nyt_eng_200608.txt 638 | NYT/nyt_eng_199804.txt 639 | NYT/nyt_eng_200109.txt 640 | NYT/nyt_eng_199908.txt 641 | NYT/nyt_eng_200805.txt 642 | NYT/nyt_eng_200310.txt 643 | NYT/nyt_eng_200502.txt 644 | NYT/nyt_eng_199606.txt 645 | NYT/nyt_eng_200312.txt 646 | NYT/nyt_eng_200401.txt 647 | NYT/nyt_eng_199409.txt 648 | NYT/nyt_eng_199909.txt 649 | NYT/nyt_eng_200409.txt 650 | NYT/nyt_eng_199509.txt 651 | NYT/nyt_eng_199503.txt 652 | NYT/nyt_eng_199604.txt 653 | NYT/nyt_eng_200901.txt 654 | NYT/nyt_eng_199506.txt 655 | NYT/nyt_eng_200708.txt 656 | NYT/nyt_eng_200204.txt 657 | NYT/nyt_eng_200301.txt 658 | NYT/nyt_eng_200304.txt 659 | NYT/nyt_eng_200910.txt 660 | NYT/nyt_eng_200008.txt 661 | NYT/nyt_eng_199407.txt 662 | NYT/nyt_eng_199508.txt 663 | NYT/nyt_eng_199609.txt 664 | NYT/nyt_eng_199710.txt 665 | NYT/nyt_eng_200101.txt 666 | NYT/nyt_eng_199602.txt 667 | NYT/nyt_eng_200210.txt 668 | NYT/nyt_eng_200107.txt 669 | NYT/nyt_eng_200108.txt 670 | NYT/nyt_eng_200308.txt 671 | NYT/nyt_eng_200801.txt 672 | NYT/nyt_eng_199712.txt 673 | NYT/nyt_eng_200802.txt 674 | NYT/nyt_eng_200912.txt 675 | NYT/nyt_eng_200807.txt 676 | NYT/nyt_eng_200201.txt 677 | NYT/nyt_eng_200706.txt 678 | NYT/nyt_eng_200007.txt 679 | 
NYT/nyt_eng_200404.txt 680 | NYT/nyt_eng_199803.txt 681 | NYT/nyt_eng_200712.txt 682 | NYT/nyt_eng_199408.txt 683 | NYT/nyt_eng_200408.txt 684 | NYT/nyt_eng_199603.txt 685 | NYT/nyt_eng_200412.txt 686 | NYT/nyt_eng_200106.txt 687 | NYT/nyt_eng_200309.txt 688 | NYT/nyt_eng_201010.txt 689 | NYT/nyt_eng_200811.txt 690 | NYT/nyt_eng_200702.txt 691 | NYT/nyt_eng_200501.txt 692 | NYT/nyt_eng_200209.txt 693 | NYT/nyt_eng_200906.txt 694 | NYT/nyt_eng_200402.txt 695 | NYT/nyt_eng_200104.txt 696 | NYT/nyt_eng_199911.txt 697 | NYT/nyt_eng_200206.txt 698 | NYT/nyt_eng_199805.txt 699 | NYT/nyt_eng_200009.txt 700 | NYT/nyt_eng_200711.txt 701 | NYT/nyt_eng_200806.txt 702 | NYT/nyt_eng_200603.txt 703 | NYT/nyt_eng_201003.txt 704 | NYT/nyt_eng_200604.txt 705 | NYT/nyt_eng_200303.txt 706 | NYT/nyt_eng_200208.txt 707 | NYT/nyt_eng_199511.txt 708 | NYT/nyt_eng_200010.txt 709 | NYT/nyt_eng_199605.txt 710 | NYT/nyt_eng_200102.txt 711 | NYT/nyt_eng_199904.txt 712 | NYT/nyt_eng_199807.txt 713 | NYT/nyt_eng_200510.txt 714 | NYT/nyt_eng_199507.txt 715 | NYT/nyt_eng_200410.txt 716 | NYT/nyt_eng_199906.txt 717 | NYT/nyt_eng_199706.txt 718 | NYT/nyt_eng_200012.txt 719 | NYT/nyt_eng_200111.txt 720 | NYT/nyt_eng_201008.txt 721 | NYT/nyt_eng_200606.txt 722 | NYT/nyt_eng_200503.txt 723 | WPB/wpb_eng_201012.txt 724 | WPB/wpb_eng_201007.txt 725 | WPB/wpb_eng_201008.txt 726 | WPB/wpb_eng_201003.txt 727 | WPB/wpb_eng_201004.txt 728 | WPB/wpb_eng_201010.txt 729 | WPB/wpb_eng_201001.txt 730 | WPB/wpb_eng_201006.txt 731 | WPB/wpb_eng_201009.txt 732 | WPB/wpb_eng_201002.txt 733 | WPB/wpb_eng_201005.txt 734 | WPB/wpb_eng_201011.txt 735 | XIN/xin_eng_199708.txt 736 | XIN/xin_eng_200303.txt 737 | XIN/xin_eng_199701.txt 738 | XIN/xin_eng_200305.txt 739 | XIN/xin_eng_200208.txt 740 | XIN/xin_eng_200203.txt 741 | XIN/xin_eng_199807.txt 742 | XIN/xin_eng_199912.txt 743 | XIN/xin_eng_200302.txt 744 | XIN/xin_eng_201010.txt 745 | XIN/xin_eng_200612.txt 746 | XIN/xin_eng_199706.txt 747 | XIN/xin_eng_200104.txt 748 | XIN/xin_eng_200912.txt 749 | XIN/xin_eng_200412.txt 750 | XIN/xin_eng_201005.txt 751 | XIN/xin_eng_200507.txt 752 | XIN/xin_eng_199609.txt 753 | XIN/xin_eng_199910.txt 754 | XIN/xin_eng_200506.txt 755 | XIN/xin_eng_200404.txt 756 | XIN/xin_eng_200712.txt 757 | XIN/xin_eng_200401.txt 758 | XIN/xin_eng_200110.txt 759 | XIN/xin_eng_199502.txt 760 | XIN/xin_eng_200312.txt 761 | XIN/xin_eng_200005.txt 762 | XIN/xin_eng_200602.txt 763 | XIN/xin_eng_200002.txt 764 | XIN/xin_eng_199907.txt 765 | XIN/xin_eng_199608.txt 766 | XIN/xin_eng_199711.txt 767 | XIN/xin_eng_200207.txt 768 | XIN/xin_eng_201006.txt 769 | XIN/xin_eng_200710.txt 770 | XIN/xin_eng_199506.txt 771 | XIN/xin_eng_200201.txt 772 | XIN/xin_eng_200706.txt 773 | XIN/xin_eng_200909.txt 774 | XIN/xin_eng_199504.txt 775 | XIN/xin_eng_200705.txt 776 | XIN/xin_eng_200806.txt 777 | XIN/xin_eng_201003.txt 778 | XIN/xin_eng_200604.txt 779 | XIN/xin_eng_200109.txt 780 | XIN/xin_eng_199606.txt 781 | XIN/xin_eng_200410.txt 782 | XIN/xin_eng_200905.txt 783 | XIN/xin_eng_200101.txt 784 | XIN/xin_eng_199909.txt 785 | XIN/xin_eng_200105.txt 786 | XIN/xin_eng_200102.txt 787 | XIN/xin_eng_199503.txt 788 | XIN/xin_eng_200408.txt 789 | XIN/xin_eng_200107.txt 790 | XIN/xin_eng_200004.txt 791 | XIN/xin_eng_199604.txt 792 | XIN/xin_eng_199610.txt 793 | XIN/xin_eng_200606.txt 794 | XIN/xin_eng_200409.txt 795 | XIN/xin_eng_200403.txt 796 | XIN/xin_eng_200301.txt 797 | XIN/xin_eng_200608.txt 798 | XIN/xin_eng_200903.txt 799 | XIN/xin_eng_199801.txt 800 | XIN/xin_eng_199508.txt 801 | 
XIN/xin_eng_200502.txt 802 | XIN/xin_eng_200701.txt 803 | XIN/xin_eng_199705.txt 804 | XIN/xin_eng_199702.txt 805 | XIN/xin_eng_200111.txt 806 | XIN/xin_eng_201012.txt 807 | XIN/xin_eng_199808.txt 808 | XIN/xin_eng_199507.txt 809 | XIN/xin_eng_200509.txt 810 | XIN/xin_eng_199911.txt 811 | XIN/xin_eng_200802.txt 812 | XIN/xin_eng_200901.txt 813 | XIN/xin_eng_201009.txt 814 | XIN/xin_eng_199501.txt 815 | XIN/xin_eng_199805.txt 816 | XIN/xin_eng_200007.txt 817 | XIN/xin_eng_200309.txt 818 | XIN/xin_eng_199804.txt 819 | XIN/xin_eng_200209.txt 820 | XIN/xin_eng_200205.txt 821 | XIN/xin_eng_201001.txt 822 | XIN/xin_eng_201002.txt 823 | XIN/xin_eng_200103.txt 824 | XIN/xin_eng_199511.txt 825 | XIN/xin_eng_200210.txt 826 | XIN/xin_eng_200611.txt 827 | XIN/xin_eng_199601.txt 828 | XIN/xin_eng_199605.txt 829 | XIN/xin_eng_199602.txt 830 | XIN/xin_eng_201008.txt 831 | XIN/xin_eng_199607.txt 832 | XIN/xin_eng_199906.txt 833 | XIN/xin_eng_200508.txt 834 | XIN/xin_eng_199902.txt 835 | XIN/xin_eng_199806.txt 836 | XIN/xin_eng_200609.txt 837 | XIN/xin_eng_200009.txt 838 | XIN/xin_eng_200211.txt 839 | XIN/xin_eng_200603.txt 840 | XIN/xin_eng_199803.txt 841 | XIN/xin_eng_201004.txt 842 | XIN/xin_eng_200703.txt 843 | XIN/xin_eng_200704.txt 844 | XIN/xin_eng_200405.txt 845 | XIN/xin_eng_200010.txt 846 | XIN/xin_eng_200911.txt 847 | XIN/xin_eng_201011.txt 848 | XIN/xin_eng_199612.txt 849 | XIN/xin_eng_200501.txt 850 | XIN/xin_eng_199509.txt 851 | XIN/xin_eng_201007.txt 852 | XIN/xin_eng_200503.txt 853 | XIN/xin_eng_200003.txt 854 | XIN/xin_eng_200908.txt 855 | XIN/xin_eng_200601.txt 856 | XIN/xin_eng_200402.txt 857 | XIN/xin_eng_200012.txt 858 | XIN/xin_eng_200808.txt 859 | XIN/xin_eng_199707.txt 860 | XIN/xin_eng_199903.txt 861 | XIN/xin_eng_200803.txt 862 | XIN/xin_eng_200512.txt 863 | XIN/xin_eng_200904.txt 864 | XIN/xin_eng_200008.txt 865 | XIN/xin_eng_199505.txt 866 | XIN/xin_eng_200805.txt 867 | XIN/xin_eng_200307.txt 868 | XIN/xin_eng_199603.txt 869 | XIN/xin_eng_200001.txt 870 | XIN/xin_eng_200907.txt 871 | XIN/xin_eng_200311.txt 872 | XIN/xin_eng_200510.txt 873 | XIN/xin_eng_200906.txt 874 | XIN/xin_eng_200006.txt 875 | XIN/xin_eng_199905.txt 876 | XIN/xin_eng_199809.txt 877 | XIN/xin_eng_199512.txt 878 | XIN/xin_eng_199709.txt 879 | XIN/xin_eng_200809.txt 880 | XIN/xin_eng_200304.txt 881 | XIN/xin_eng_200308.txt 882 | XIN/xin_eng_200812.txt 883 | XIN/xin_eng_200504.txt 884 | XIN/xin_eng_200707.txt 885 | XIN/xin_eng_200810.txt 886 | XIN/xin_eng_200202.txt 887 | XIN/xin_eng_199710.txt 888 | XIN/xin_eng_200607.txt 889 | XIN/xin_eng_200605.txt 890 | XIN/xin_eng_200811.txt 891 | XIN/xin_eng_200108.txt 892 | XIN/xin_eng_200011.txt 893 | XIN/xin_eng_200708.txt 894 | XIN/xin_eng_199703.txt 895 | XIN/xin_eng_200801.txt 896 | XIN/xin_eng_200505.txt 897 | XIN/xin_eng_200709.txt 898 | XIN/xin_eng_199712.txt 899 | XIN/xin_eng_200807.txt 900 | XIN/xin_eng_200206.txt 901 | XIN/xin_eng_200204.txt 902 | XIN/xin_eng_200610.txt 903 | XIN/xin_eng_200910.txt 904 | XIN/xin_eng_199611.txt 905 | -------------------------------------------------------------------------------- /dataset/valid.splits: -------------------------------------------------------------------------------- 1 | AFP/afp_eng_200601.txt 2 | AFP/afp_eng_199702.txt 3 | AFP/afp_eng_200506.txt 4 | AFP/afp_eng_200308.txt 5 | AFP/afp_eng_200703.txt 6 | AFP/afp_eng_199502.txt 7 | AFP/afp_eng_199406.txt 8 | APW/apw_eng_199606.txt 9 | APW/apw_eng_199807.txt 10 | APW/apw_eng_200204.txt 11 | APW/apw_eng_199708.txt 12 | APW/apw_eng_200907.txt 13 | 
APW/apw_eng_200309.txt 14 | APW/apw_eng_200205.txt 15 | APW/apw_eng_200210.txt 16 | APW/apw_eng_200608.txt 17 | CNA/cna_eng_200710.txt 18 | CNA/cna_eng_200912.txt 19 | CNA/cna_eng_200102.txt 20 | CNA/cna_eng_200803.txt 21 | CNA/cna_eng_200504.txt 22 | CNA/cna_eng_200003.txt 23 | CNA/cna_eng_201005.txt 24 | LTW/ltw_eng_200310.txt 25 | LTW/ltw_eng_200512.txt 26 | LTW/ltw_eng_199503.txt 27 | LTW/ltw_eng_200407.txt 28 | LTW/ltw_eng_199803.txt 29 | LTW/ltw_eng_199509.txt 30 | NYT/nyt_eng_200205.txt 31 | NYT/nyt_eng_200203.txt 32 | NYT/nyt_eng_199512.txt 33 | NYT/nyt_eng_200307.txt 34 | NYT/nyt_eng_199709.txt 35 | NYT/nyt_eng_199704.txt 36 | NYT/nyt_eng_201011.txt 37 | NYT/nyt_eng_200003.txt 38 | NYT/nyt_eng_200306.txt 39 | XIN/xin_eng_199901.txt 40 | XIN/xin_eng_200702.txt 41 | XIN/xin_eng_200407.txt 42 | XIN/xin_eng_199904.txt 43 | XIN/xin_eng_200406.txt 44 | XIN/xin_eng_200306.txt 45 | XIN/xin_eng_199510.txt 46 | XIN/xin_eng_199908.txt 47 | XIN/xin_eng_200212.txt 48 | -------------------------------------------------------------------------------- /prep_torch_data.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | COUNT=5 4 | WINDOW=5 5 | 6 | DATA_DIR=$1 7 | OUT_DIR=$1/processed 8 | SCRIPTS=$ABS/summary 9 | 10 | export LUA_PATH="$LUA_PATH;$ABS/?.lua" 11 | 12 | mkdir -p $OUT_DIR 13 | 14 | th $SCRIPTS/build_dict.lua -inf $DATA_DIR/train.article.dict -outf $OUT_DIR/train.article.dict.torch 15 | th $SCRIPTS/build_dict.lua -inf $DATA_DIR/train.title.dict -outf $OUT_DIR/train.title.dict.torch 16 | 17 | echo "-- Creating data directories." 18 | mkdir -p $OUT_DIR/train/title 19 | mkdir -p $OUT_DIR/train/article 20 | 21 | mkdir -p $OUT_DIR/valid.filter/title 22 | mkdir -p $OUT_DIR/valid.filter/article 23 | 24 | cp $OUT_DIR/train.title.dict.torch $OUT_DIR/train/title/dict 25 | cp $OUT_DIR/train.article.dict.torch $OUT_DIR/train/article/dict 26 | 27 | 28 | echo "-- Build the matrices" 29 | 30 | # Share the dictionary. 31 | th $SCRIPTS/build.lua -inArticleDictionary $OUT_DIR/train.article.dict.torch -inTitleDictionary $OUT_DIR/train.title.dict.torch -inTitleFile $DATA_DIR/valid.title.filter.txt -outTitleDirectory $OUT_DIR/valid.filter/title/ -inArticleFile $DATA_DIR/valid.article.filter.txt -outArticleDirectory $OUT_DIR/valid.filter/article/ -window $WINDOW 32 | 33 | th $SCRIPTS/build.lua -inArticleDictionary $OUT_DIR/train.article.dict.torch -inTitleDictionary $OUT_DIR/train.title.dict.torch -inTitleFile $DATA_DIR/train.title.txt -outTitleDirectory $OUT_DIR/train/title/ -inArticleFile $DATA_DIR/train.article.txt -outArticleDirectory $OUT_DIR/train/article/ -window $WINDOW 34 | -------------------------------------------------------------------------------- /summary/beam_search.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- A beam search decoder 14 | local data = require('summary.data') 15 | local features = require('summary.features') 16 | local util = require('summary.util') 17 | 18 | local beam = {} 19 | local INF = 1e9 20 | 21 | function beam.addOpts(cmd) 22 | cmd:option('-allowUNK', false, "Allow generating .") 23 | cmd:option('-fixedLength', true, "Produce exactly -length words.") 24 | cmd:option('-blockRepeatWords', false, "Disallow generating a word twice.") 25 | cmd:option('-lmWeight', 1.0, "Weight for main model.") 26 | cmd:option('-beamSize', 100, "Size of the beam.") 27 | cmd:option('-extractive', false, "Force fully extractive summary.") 28 | cmd:option('-abstractive', false, "Force fully abstractive summary.") 29 | cmd:option('-recombine', false, "Used hypothesis recombination.") 30 | features.addOpts(cmd) 31 | end 32 | 33 | function beam.init(opt, mlp, aux_model, article_to_title, dict) 34 | local new_beam = {} 35 | setmetatable(new_beam, { __index = beam }) 36 | new_beam.opt = opt 37 | new_beam.K = opt.beamSize 38 | new_beam.mlp = mlp 39 | new_beam.aux_model = aux_model 40 | new_beam.article_to_title = article_to_title 41 | new_beam.dict = dict 42 | 43 | -- Special Symbols. 44 | new_beam.UNK = dict.symbol_to_index[""] 45 | new_beam.START = dict.symbol_to_index[""] 46 | new_beam.END = dict.symbol_to_index[""] 47 | 48 | return new_beam 49 | end 50 | 51 | -- Helper: convert flat index to matrix. 52 | local function flat_to_rc(v, indices, flat_index) 53 | local row = math.floor((flat_index - 1) / v:size(2)) + 1 54 | return row, indices[row][(flat_index - 1) % v:size(2) + 1] 55 | end 56 | 57 | -- Helper: find kmax of vector. 58 | local function find_k_max(pool, mat) 59 | local v = pool:forward(mat:t()):t() 60 | local orig_indices = pool.indices:t():add(1) 61 | return v:contiguous(), orig_indices 62 | end 63 | 64 | -- Use beam search to generate a summary of 65 | -- the article of length <= len. 66 | function beam:generate(article, len) 67 | local n = len 68 | local K = self.K 69 | local W = self.opt.window 70 | 71 | -- Initialize the extractive features. 72 | local feat_gen = features.init(self.opt, self.article_to_title) 73 | feat_gen:match_words(self.START, article) 74 | local F = feat_gen.num_features 75 | local FINAL_VAL = 1000 76 | 77 | -- Initilize the charts. 78 | -- scores[i][k] is the log prob of the k'th hyp of i words. 79 | -- hyps[i][k] contains the words in k'th hyp at 80 | -- i word (left padded with W ) tokens. 81 | -- feats[i][k][f] contains the feature count of 82 | -- the f features for the k'th hyp at word i. 83 | local result = {} 84 | local scores = torch.zeros(n+1, K):float() 85 | local hyps = torch.zeros(n+1, K, W+n+1):long() 86 | local feats = torch.zeros(n+1, K, F):float() 87 | hyps:fill(self.START) 88 | 89 | -- Initilialize used word set. 90 | -- words_used[i][k] is a set of the words used in the i,k hyp. 91 | local words_used = {} 92 | if self.opt.blockRepeatWords then 93 | for i = 1, n + 1 do 94 | words_used[i] = {} 95 | for k = 1, K do 96 | words_used[i][k] = {} 97 | end 98 | end 99 | end 100 | 101 | -- Find k-max columns of a matrix. 102 | -- Use 2*k in case some are invalid. 103 | local pool = nn.TemporalKMaxPooling(2*K) 104 | 105 | -- Main loop of beam search. 106 | for i = 1, n do 107 | local cur_beam = hyps[i]:narrow(2, i+1, W) 108 | local cur_K = K 109 | 110 | -- (1) Score all next words for each context in the beam. 
111 | -- log p(y_{i+1} | y_c, x) for all y_c 112 | local input = data.make_input(article, cur_beam, cur_K) 113 | local model_scores = self.mlp:forward(input) 114 | 115 | local out = model_scores:clone():double() 116 | out:mul(self.opt.lmWeight) 117 | 118 | -- If length limit is reached, next word must be end. 119 | local finalized = (i == n) and self.opt.fixedLength 120 | if finalized then 121 | out[{{}, self.END}]:add(FINAL_VAL) 122 | else 123 | -- Apply hard constraints. 124 | out[{{}, self.START}] = -INF 125 | if not self.opt.allowUNK then 126 | out[{{}, self.UNK}] = -INF 127 | end 128 | if self.opt.fixedLength then 129 | out[{{}, self.END}] = -INF 130 | end 131 | 132 | -- Add additional extractive features. 133 | feat_gen:add_features(out, cur_beam) 134 | end 135 | 136 | -- Only take first row when starting out. 137 | if i == 1 then 138 | cur_K = 1 139 | out = out:narrow(1, 1, 1) 140 | model_scores = model_scores:narrow(1, 1, 1) 141 | end 142 | 143 | -- Prob of summary is log p + log p(y_{i+1} | y_c, x) 144 | for k = 1, cur_K do 145 | out[k]:add(scores[i][k]) 146 | end 147 | 148 | -- (2) Retain the K-best words for each hypothesis using GPU. 149 | -- This leaves a KxK matrix which we flatten to a K^2 vector. 150 | local max_scores, mat_indices = find_k_max(pool, out:cuda()) 151 | local flat = max_scores:view(max_scores:size(1) 152 | * max_scores:size(2)):float() 153 | 154 | -- 3) Construct the next hypotheses by taking the next k-best. 155 | local seen_ngram = {} 156 | for k = 1, K do 157 | for _ = 1, 100 do 158 | 159 | -- (3a) Pull the score, index, rank, and word of the 160 | -- current best in the table, and then zero it out. 161 | local score, index = flat:max(1) 162 | if finalized then 163 | score[1] = score[1] - FINAL_VAL 164 | end 165 | scores[i+1][k] = score[1] 166 | local prev_k, y_i1 = flat_to_rc(max_scores, mat_indices, index[1]) 167 | flat[index[1]] = -INF 168 | 169 | -- (3b) Is this a valid next word? 170 | local blocked = (self.opt.blockRepeatWords and 171 | words_used[i][prev_k][y_i1]) 172 | 173 | blocked = blocked or 174 | (self.opt.extractive and not feat_gen:has_ngram({y_i1})) 175 | blocked = blocked or 176 | (self.opt.abstractive and feat_gen:has_ngram({y_i1})) 177 | 178 | -- Hypothesis recombination. 179 | local new_context = {} 180 | if self.opt.recombine then 181 | for j = i+2, i+W do 182 | table.insert(new_context, hyps[i][prev_k][j]) 183 | end 184 | table.insert(new_context, y_i1) 185 | blocked = blocked or util.has(seen_ngram, new_context) 186 | end 187 | 188 | -- (3c) Add the word, its score, and its features to the 189 | -- beam. 190 | if not blocked then 191 | -- Update tables with new hypothesis. 192 | for j = 1, i+W do 193 | local pword = hyps[i][prev_k][j] 194 | hyps[i+1][k][j] = pword 195 | words_used[i+1][k][pword] = true 196 | end 197 | hyps[i+1][k][i+W+1] = y_i1 198 | words_used[i+1][k][y_i1] = true 199 | 200 | -- Keep track of hypotheses seen. 201 | if self.opt.recombine then 202 | util.add(seen_ngram, new_context) 203 | end 204 | 205 | -- Keep track of features used (For MERT) 206 | feats[i+1][k]:copy(feats[i][prev_k]) 207 | feat_gen:compute(feats[i+1][k], hyps[i+1][k], 208 | model_scores[prev_k][y_i1], y_i1, i) 209 | 210 | -- If we have produced an END symbol, push to stack. 211 | if y_i1 == self.END then 212 | table.insert(result, {i+1, scores[i+1][k], 213 | hyps[i+1][k]:clone(), 214 | feats[i+1][k]:clone()}) 215 | scores[i+1][k] = -INF 216 | end 217 | break 218 | end 219 | end 220 | end 221 | end 222 | 223 | -- Sort by score. 
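-- Each entry of `result` is {length, log-score, hypothesis tensor, feature
-- counts}; the sort below simply keeps the highest-scoring finished
-- summaries first. A minimal usage sketch (the local names `sbeam`,
-- `article` and the length 15 are illustrative, not part of this file):
--
--   local sbeam = beam.init(opt, mlp, aux_model, article_to_title, dict)
--   local results = sbeam:generate(article, 15)
--   local best_len, best_score, best_hyp = unpack(results[1])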
224 | table.sort(result, function (a, b) return a[2] > b[2] end) 225 | 226 | -- Return the scores and hypotheses at the final stage. 227 | return result 228 | end 229 | 230 | 231 | return beam 232 | -------------------------------------------------------------------------------- /summary/build.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Script to build the dataset 14 | require('torch') 15 | local utils = require('summary/util') 16 | 17 | torch.setdefaulttensortype('torch.LongTensor') 18 | 19 | cmd = torch.CmdLine() 20 | cmd:text() 21 | cmd:text() 22 | cmd:text('Build torch serialized version of a summarization problem.') 23 | cmd:text() 24 | 25 | cmd:option('-window', 5, 'The ngram window to use.') 26 | 27 | cmd:option('-inTitleFile', '', 'The input file.') 28 | cmd:option('-inTitleDictionary', '', 'The input dictionary.') 29 | cmd:option('-outTitleDirectory', '', 'The output directory.') 30 | cmd:option('-inArticleFile', '', 'The input file.') 31 | cmd:option('-inArticleDictionary', '', 'The input dictionary.') 32 | cmd:option('-outArticleDirectory', '', 'The output directory.') 33 | 34 | opt = cmd:parse(arg) 35 | 36 | local function count(file, aligned_lengths, pad) 37 | -- Count up properties of the input file. 38 | local f = io.open(file, 'r') 39 | local counter = { 40 | nsents = 0, 41 | max_length = 0, 42 | aligned_lengths = {}, 43 | line_lengths = {}, 44 | bucket_words = {}} 45 | local nline = 1 46 | for l in f:lines() do 47 | local true_l = l 48 | if pad then 49 | true_l = " " .. l .. " " 50 | end 51 | local line = utils.string_split(true_l, " ") 52 | counter.line_lengths[#line] = (counter.line_lengths[#line] or 0) + 1 53 | counter.nsents = counter.nsents + 1 54 | counter.aligned_lengths[nline] = #line 55 | if aligned_lengths ~= nil then 56 | -- Add extra for implicit . 57 | counter.bucket_words[aligned_lengths[nline]] = 58 | (counter.bucket_words[aligned_lengths[nline]] or 0) 59 | + #line + 1 60 | end 61 | nline = nline + 1 62 | end 63 | return counter 64 | end 65 | 66 | 67 | local function build_article_matrices(dict, file, nsents, line_lengths) 68 | -- For each length bucket, construct a #sentence x length matrix 69 | -- of word forms. 70 | local f = io.open(file, 'r') 71 | 72 | -- One matrix for each length. 73 | local mat = {} 74 | 75 | -- Number of sentences seen of this length. 76 | local of_length = {} 77 | 78 | for length, count in pairs(line_lengths) do 79 | mat[length] = torch.zeros(count, length):long() 80 | of_length[length] = 1 81 | end 82 | 83 | -- For each sentence. 84 | -- Col 1 is its length bin. 85 | -- Col 2 is its position in bin. 86 | local pos = torch.zeros(nsents, 2):long() 87 | 88 | local nsent = 1 89 | for l in f:lines() do 90 | local true_l = " " .. l .. 
" " 91 | local line = utils.string_split(true_l, " ") 92 | local length = #line 93 | local nbin = of_length[length] 94 | for j = 1, #line do 95 | local index = dict.symbol_to_index[line[j]] or 1 96 | --assert(index ~= nil) 97 | mat[length][nbin][j] = index 98 | end 99 | pos[nsent][1] = length 100 | pos[nsent][2] = nbin 101 | of_length[length] = nbin + 1 102 | nsent = nsent + 1 103 | end 104 | return mat, pos 105 | end 106 | 107 | 108 | local function build_title_matrices(dict, file, aligned_lengths, 109 | bucket_sizes, window) 110 | -- For each article length bucket construct a num-words x 1 flat vector 111 | -- of word forms and a corresponding num-words x window matrix of 112 | -- context forms. 113 | local nsent = 1 114 | local pos = {} 115 | 116 | -- One matrix for each length. 117 | local mat = {} 118 | local ngram = {} 119 | 120 | -- Number of sentences seen of this length. 121 | local sent_of_length = {} 122 | local words_of_length = {} 123 | 124 | -- Initialize. 125 | for length, count in pairs(bucket_sizes) do 126 | mat[length] = torch.zeros(count, 3):long() 127 | sent_of_length[length] = 1 128 | words_of_length[length] = 1 129 | ngram[length] = torch.zeros(count, window):long() 130 | end 131 | 132 | -- Columns are the preceding window. 133 | local nline = 1 134 | local f = io.open(file, 'r') 135 | for l in f:lines() do 136 | -- Add implicit . 137 | local true_l = l .. " " 138 | local line = utils.string_split(true_l, " ") 139 | 140 | local last = {} 141 | -- Initialize window as START symbol. 142 | for w = 1, window do 143 | table.insert(last, dict.symbol_to_index[""]) 144 | end 145 | 146 | local aligned_length = aligned_lengths[nline] 147 | for j = 1, #line do 148 | local nword = words_of_length[aligned_length] 149 | local index = dict.symbol_to_index[line[j]] or 1 150 | 151 | mat[aligned_length][nword][1] = index 152 | mat[aligned_length][nword][2] = sent_of_length[aligned_length] 153 | mat[aligned_length][nword][3] = j 154 | 155 | -- Move the window forward. 156 | for w = 1, window-1 do 157 | ngram[aligned_length][nword][w] = last[w] 158 | last[w] = last[w+1] 159 | end 160 | ngram[aligned_length][nword][window] = last[window] 161 | last[window] = index 162 | words_of_length[aligned_length] = words_of_length[aligned_length] + 1 163 | end 164 | sent_of_length[aligned_length] = sent_of_length[aligned_length] + 1 165 | nsent = nsent + 1 166 | 167 | -- Debug logging. 168 | if nsent % 100000 == 1 then 169 | print(nsent) 170 | end 171 | nline = nline + 1 172 | end 173 | return mat, pos, ngram 174 | end 175 | 176 | local function main() 177 | local counter = count(opt.inArticleFile, nil, true) 178 | local dict = torch.load(opt.inArticleDictionary) 179 | 180 | -- Construct a rectangular word matrix. 181 | local word_mat, offset_mat = 182 | build_article_matrices(dict, opt.inArticleFile, 183 | counter.nsents, counter.line_lengths) 184 | torch.save(opt.outArticleDirectory .. '/word.mat.torch', word_mat) 185 | torch.save(opt.outArticleDirectory .. '/offset.mat.torch', offset_mat) 186 | 187 | local title_counter = count(opt.inTitleFile, counter.aligned_lengths, false) 188 | local title_dict = torch.load(opt.inTitleDictionary) 189 | 190 | -- Construct a 1d word matrix. 191 | local word_mat, offset_mat, ngram_mat = 192 | build_title_matrices(title_dict, 193 | opt.inTitleFile, 194 | counter.aligned_lengths, 195 | title_counter.bucket_words, 196 | opt.window) 197 | torch.save(opt.outTitleDirectory .. '/word.mat.torch', word_mat) 198 | torch.save(opt.outTitleDirectory .. 
'/offset.mat.torch', offset_mat) 199 | torch.save(opt.outTitleDirectory .. '/ngram.mat.torch', ngram_mat) 200 | end 201 | 202 | main() 203 | -------------------------------------------------------------------------------- /summary/build_dict.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Script to build the dictionary 14 | local utils = require('summary/util') 15 | 16 | cmd = torch.CmdLine() 17 | cmd:text() 18 | cmd:text() 19 | cmd:text('Build torch serialized version of a dictionary file.') 20 | cmd:text() 21 | cmd:text('Options') 22 | cmd:option('-inf', '', 'The input dictionary.') 23 | cmd:option('-outf', '', 'The output directory.') 24 | cmd:text() 25 | 26 | opt = cmd:parse(arg) 27 | 28 | local f = io.open(opt.inf, 'r') 29 | local word_id = 0 30 | local dict = {symbol_to_index = {}, 31 | index_to_symbol = {}} 32 | for l in f:lines() do 33 | word_id = word_id + 1 34 | local word = utils.string_split(l)[1] 35 | dict.symbol_to_index[word] = word_id 36 | dict.index_to_symbol[word_id] = word 37 | end 38 | torch.save(opt.outf, dict) 39 | -------------------------------------------------------------------------------- /summary/data.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Load data for summary experiments. 
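-- Title and article matrices are stored in per-length buckets; data.init
-- pairs the two sides and next_batch serves one bucket at a time. A hedged
-- loading sketch (directory names are placeholders, and a CUDA-enabled Torch
-- is assumed, as in the rest of this file):
--
--   local data     = require('summary.data')
--   local titles   = data.load_title('processed/train/title/', true)
--   local articles = data.load_article('processed/train/article/')
--   local train    = data.init(titles, articles)
--   while not train:is_done() do
--      local input, target = train:next_batch(64)
--      -- input = {article words, positions, title context}; target = next word
--   end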
14 | local util = require('summary/util') 15 | 16 | local data = {} 17 | 18 | function data.add_opts(cmd) 19 | cmd:option('-articleDir', '', 20 | 'Directory containing article training matrices.') 21 | cmd:option('-titleDir', '', 22 | 'Directory containing title training matrices.') 23 | cmd:option('-validArticleDir', '', 24 | 'Directory containing article matricess for validation.') 25 | cmd:option('-validTitleDir', '', 26 | 'Directory containing title matrices for validation.') 27 | end 28 | 29 | function data.load(article_dir, title_dir) 30 | return data.init() 31 | end 32 | 33 | function data.init(title_data, article_data) 34 | local new_data = {} 35 | setmetatable(new_data, { __index = data }) 36 | new_data.title_data = title_data 37 | new_data.article_data = article_data 38 | new_data:reset() 39 | return new_data 40 | end 41 | 42 | function data:reset() 43 | self.bucket_order = {} 44 | for length, _ in pairs(self.title_data.target) do 45 | table.insert(self.bucket_order, length) 46 | end 47 | util.shuffleTable(self.bucket_order) 48 | self.bucket_index = 0 49 | self:load_next_bucket() 50 | end 51 | 52 | function data:load_next_bucket() 53 | self.done_bucket = false 54 | self.bucket_index = self.bucket_index + 1 55 | self.bucket = self.bucket_order[self.bucket_index] 56 | self.bucket_size = self.title_data.target[self.bucket]:size(1) 57 | self.pos = 1 58 | self.aux_ptrs = self.title_data.sentences[self.bucket]:float():long() 59 | self.positions = torch.range(1, self.bucket):view(1, self.bucket) 60 | :expand(1000, self.bucket):contiguous():cuda() + (200 * self.bucket) 61 | end 62 | 63 | function data:is_done() 64 | return self.bucket_index >= #self.bucket_order - 1 and 65 | self.done_bucket 66 | end 67 | 68 | function data:next_batch(max_size) 69 | local diff = self.bucket_size - self.pos 70 | if self.done_bucket or diff == 0 or diff == 1 then 71 | self:load_next_bucket() 72 | end 73 | local offset 74 | if self.pos + max_size > self.bucket_size then 75 | offset = self.bucket_size - self.pos 76 | self.done_bucket = true 77 | else 78 | offset = max_size 79 | end 80 | local positions = self.positions:narrow(1, 1, offset) 81 | 82 | local aux_rows = self.article_data.words[self.bucket]: 83 | index(1, self.aux_ptrs:narrow(1, self.pos, offset)) 84 | local context = self.title_data.ngram[self.bucket] 85 | :narrow(1, self.pos, offset) 86 | local target = self.title_data.target[self.bucket] 87 | :narrow(1, self.pos, offset) 88 | self.pos = self.pos + offset 89 | return {aux_rows, positions, context}, target 90 | end 91 | 92 | function data.make_input(article, context, K) 93 | local bucket = article:size(1) 94 | local aux_sentence = article:view(bucket, 1) 95 | :expand(article:size(1), K):t():contiguous():cuda() 96 | local positions = torch.range(1, bucket):view(bucket, 1) 97 | :expand(bucket, K):t():contiguous():cuda() + (200 * bucket) 98 | return {aux_sentence, positions, context} 99 | end 100 | 101 | function data.load_title_dict(dname) 102 | return torch.load(dname .. 'dict') 103 | end 104 | 105 | function data.load_title(dname, shuffle, use_dict) 106 | local ngram = torch.load(dname .. 'ngram.mat.torch') 107 | local words = torch.load(dname .. 'word.mat.torch') 108 | local dict = use_dict or torch.load(dname .. 
'dict') 109 | local target_full = {} 110 | local sentences_full = {} 111 | local pos_full = {} 112 | for length, mat in pairs(ngram) do 113 | if shuffle ~= nil then 114 | local perm = torch.randperm(ngram[length]:size(1)):long() 115 | ngram[length] = ngram[length]:index(1, perm):float():cuda() 116 | words[length] = words[length]:index(1, perm) 117 | else 118 | ngram[length] = ngram[length]:float():cuda() 119 | end 120 | assert(ngram[length]:size(1) == words[length]:size(1)) 121 | target_full[length] = words[length][{{}, 1}]:contiguous():float():cuda() 122 | sentences_full[length] = 123 | words[length][{{}, 2}]:contiguous():float():cuda() 124 | pos_full[length] = words[length][{{}, 3}] 125 | 126 | end 127 | local title_data = {ngram = ngram, 128 | target = target_full, 129 | sentences = sentences_full, 130 | pos = pos_full, 131 | dict = dict} 132 | return title_data 133 | end 134 | 135 | function data.load_article(dname, use_dict) 136 | local input_words = torch.load(dname .. 'word.mat.torch') 137 | -- local offsets = torch.load(dname .. 'offset.mat.torch') 138 | 139 | local dict = use_dict or torch.load(dname .. 'dict') 140 | for length, mat in pairs(input_words) do 141 | input_words[length] = mat 142 | input_words[length] = input_words[length]:float():cuda() 143 | end 144 | local article_data = {words = input_words, dict = dict} 145 | return article_data 146 | end 147 | 148 | return data 149 | -------------------------------------------------------------------------------- /summary/encoder.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- require('fbcunn') 14 | 15 | local encoder = {} 16 | 17 | function encoder.add_opts(cmd) 18 | cmd:option('-encoderModel', 'bow', "The encoder model to use.") 19 | cmd:option('-bowDim', 50, "Article embedding size.") 20 | cmd:option('-attenPool', 5, "Attention model pooling size.") 21 | cmd:option('-hiddenUnits', 1000, "Conv net encoder hidden units.") 22 | cmd:option('-kernelWidth', 5, "Conv net encoder kernel width.") 23 | end 24 | 25 | 26 | function encoder.build(opt, data) 27 | torch.setdefaulttensortype("torch.CudaTensor") 28 | local model = nil 29 | if opt.encoderModel == "none" then 30 | model = encoder.build_blank_model(opt, data) 31 | elseif opt.encoderModel == "bow" then 32 | model = encoder.build_bow_model(opt, data) 33 | elseif opt.encoderModel == "attenbow" then 34 | model = encoder.build_attnbow_model(opt, data) 35 | elseif opt.encoderModel == "conv" then 36 | model = encoder.build_conv_model(opt, data) 37 | end 38 | torch.setdefaulttensortype("torch.DoubleTensor") 39 | return model 40 | end 41 | 42 | 43 | function encoder.build_blank_model(opt, data) 44 | -- Ignores the article layer entirely (acts like LM). 
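-- The encoder output is multiplied by zero below, so with `-encoderModel none`
-- the conditional model reduces to a plain feed-forward language model over
-- the title context alone.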
45 | local lookup = nn.Identity()() 46 | local ignore1 = nn.Identity()() 47 | local ignore2 = nn.Identity()() 48 | local start = nn.SelectTable(3)({lookup, ignore1, ignore2}) 49 | 50 | local mout = nn.MulConstant(0)(start) 51 | local encoder_mlp = nn.gModule({lookup, ignore1, ignore2}, {mout}) 52 | encoder_mlp:cuda() 53 | return encoder_mlp 54 | end 55 | 56 | 57 | function encoder.build_bow_model(opt, data) 58 | print("Encoder model: Bag-of-Words") 59 | 60 | -- BOW with mean on article. 61 | local lookup = nn.LookupTable( 62 | #data.article_data.dict.index_to_symbol, 63 | opt.bowDim)() 64 | 65 | -- Ignore the context. 66 | local ignore1 = nn.Identity()() 67 | local ignore2 = nn.Identity()() 68 | 69 | -- Ignores the context and position input. 70 | local start = nn.SelectTable(1)({lookup, ignore1, ignore2}) 71 | local mout = nn.Linear(opt.bowDim, opt.bowDim)( 72 | nn.Mean(3)(nn.Transpose({2, 3})(start))) 73 | 74 | local encoder_mlp = nn.gModule({lookup, ignore1, ignore2}, {mout}) 75 | encoder_mlp:cuda() 76 | 77 | return encoder_mlp 78 | end 79 | 80 | 81 | function encoder.build_conv_model(opt, data) 82 | -- Three layer thin convolutional architecture. 83 | print("Encoder model: Conv") 84 | local V2 = #data.article_data.dict.index_to_symbol 85 | local nhid = opt.hiddenUnits 86 | 87 | -- Article embedding. 88 | local article_lookup = nn.LookupTable(V2, nhid)() 89 | 90 | -- Ignore the context. 91 | local ignore1 = nn.Identity()() 92 | local ignore2 = nn.Identity()() 93 | local start = nn.SelectTable(1)({article_lookup, ignore1, ignore2}) 94 | local kwidth = opt.kernelWidth 95 | local model = nn.Sequential() 96 | model:add(nn.View(1, -1, nhid):setNumInputDims(2)) 97 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0)) 98 | model:add(cudnn.SpatialMaxPooling(1, 2, 1, 2)) 99 | model:add(nn.Threshold()) 100 | model:add(nn.Transpose({2,4})) 101 | 102 | -- layer 2 103 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0)) 104 | model:add(nn.Threshold()) 105 | model:add(nn.Transpose({2,4})) 106 | 107 | -- layer 3 108 | model:add(cudnn.SpatialConvolution(1, nhid, nhid, kwidth, 1, 1, 0)) 109 | model:add(nn.View(nhid, -1):setNumInputDims(3)) 110 | model:add(nn.Max(3)) 111 | local done = nn.View(opt.hiddenUnits)(model(start)) 112 | 113 | local mout = nn.Linear(opt.hiddenUnits, opt.embeddingDim)(done) 114 | 115 | local encoder_mlp = nn.gModule({article_lookup, ignore1, ignore2}, {mout}) 116 | encoder_mlp.lookup = article_lookup.data.module 117 | encoder_mlp:cuda() 118 | return encoder_mlp 119 | end 120 | 121 | 122 | function encoder.build_attnbow_model(opt, data) 123 | print("Encoder model: BoW + Attention") 124 | 125 | local D2 = opt.bowDim 126 | local N = opt.window 127 | local V = #data.title_data.dict.index_to_symbol 128 | local V2 = #data.article_data.dict.index_to_symbol 129 | 130 | -- Article Embedding. 131 | local article_lookup = nn.LookupTable(V2, D2)() 132 | 133 | -- Title Embedding. 134 | local title_lookup = nn.LookupTable(V, D2)() 135 | 136 | -- Size Lookup 137 | local size_lookup = nn.Identity()() 138 | 139 | -- Ignore size lookup to make NNGraph happy. 140 | local article_context = nn.SelectTable(1)({article_lookup, size_lookup}) 141 | 142 | -- Pool article 143 | local pad = (opt.attenPool - 1) / 2 144 | local article_match = article_context 145 | 146 | -- Title context embedding. 147 | local title_context = nn.View(D2, 1)( 148 | nn.Linear(N * D2, D2)(nn.View(N * D2)(title_lookup))) 149 | 150 | -- Attention layer. Distribution over article. 
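-- Informally: every article position j is scored by the dot product of its
-- embedding with a learned projection of the N-word title context; a softmax
-- over these scores gives the attention distribution p, and the encoder
-- output is the p-weighted sum of article embeddings locally smoothed over a
-- window of `attenPool` words, followed by a final linear map.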
151 | local dot_article_context = nn.MM()({article_match, 152 | title_context}) 153 | 154 | -- Compute the attention distribution. 155 | local non_linearity = nn.SoftMax() 156 | local attention = non_linearity(nn.Sum(3)(dot_article_context)) 157 | 158 | local process_article = 159 | nn.Sum(2)(nn.SpatialSubSampling(1, 1, opt.attenPool)( 160 | nn.SpatialZeroPadding(0, 0, pad, pad)( 161 | nn.View(1, -1, D2):setNumInputDims(2)(article_context)))) 162 | 163 | -- Apply attention to the subsampled article. 164 | local mout = nn.Linear(D2, D2)( 165 | nn.Sum(3)(nn.MM(true, false)( 166 | {process_article, 167 | nn.View(-1, 1):setNumInputDims(1)(attention)}))) 168 | 169 | -- Apply attention 170 | local encoder_mlp = nn.gModule({article_lookup, size_lookup, title_lookup}, 171 | {mout}) 172 | 173 | encoder_mlp:cuda() 174 | encoder_mlp.lookup = article_lookup.data.module 175 | encoder_mlp.title_lookup = title_lookup.data.module 176 | return encoder_mlp 177 | end 178 | 179 | return encoder 180 | -------------------------------------------------------------------------------- /summary/features.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | local util = require('summary.util') 14 | 15 | local features = {} 16 | 17 | function features.addOpts(cmd) 18 | cmd:option('-lmWeight', 1.0, "Feature weight for the neural model.") 19 | cmd:option('-unigramBonus', 0.0, "Feature weight for unigram extraction.") 20 | cmd:option('-bigramBonus', 0.0, "Feature weight for bigram extraction.") 21 | cmd:option('-trigramBonus', 0.0, "Feature weight for trigram extraction.") 22 | cmd:option('-lengthBonus', 0.0, "Feature weight for length.") 23 | cmd:option('-unorderBonus', 0.0, "Feature weight for out-of-order.") 24 | end 25 | 26 | -- Feature positions. 27 | local NNLM = 1 28 | local UNI = 2 29 | local BI = 3 30 | local TRI = 4 31 | local OO = 5 32 | local LEN = 6 33 | 34 | local kFeat = 6 35 | 36 | function features.init(opt, article_to_title) 37 | local new_features = {} 38 | setmetatable(new_features, { __index = features }) 39 | new_features.opt = opt 40 | new_features.num_features = kFeat 41 | new_features.article_to_title = article_to_title 42 | return new_features 43 | end 44 | 45 | -- Helper: Are words in article. 46 | function features:has_ngram(words) 47 | return util.has(self.ngrams[#words], words) 48 | end 49 | 50 | -- Augment the feature count based on the new word. 51 | function features:compute(f_new, hyp, out_score, y_i1, i) 52 | local W = self.opt.window 53 | 54 | -- LM Features. 
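-- (The NNLM slot accumulates the model log-probability; the UNI/BI/TRI/OO
-- slots count unigram, bigram, trigram and out-of-order matches against the
-- source article, and LEN counts the summary length. These are the raw counts
-- rescored at decode time by -lmWeight, -unigramBonus, -bigramBonus,
-- -trigramBonus, -unorderBonus and -lengthBonus, the weights that ZMERT tunes
-- via tuning/params.txt.)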
55 | f_new[NNLM] = f_new[NNLM] + out_score 56 | 57 | if self:has_ngram({y_i1}) then 58 | f_new[UNI] = f_new[UNI] + 1 59 | end 60 | 61 | if self:has_ngram({hyp[i+W], y_i1}) then 62 | f_new[BI] = f_new[BI] + 1 63 | end 64 | 65 | if self:has_ngram({hyp[i+W-1], hyp[i+W], y_i1}) then 66 | f_new[TRI] = f_new[TRI] + 1 67 | end 68 | 69 | if self.ooordered_ngram[hyp[i+W]] ~= nil and 70 | self.ooordered_ngram[hyp[i+W]][y_i1] ~= nil then 71 | f_new[OO] = f_new[OO] + 1 72 | end 73 | 74 | -- Length 75 | f_new[LEN] = f_new[LEN] + 1 76 | end 77 | 78 | -- Augment the score based on the extractive feature values. 79 | function features:add_features(out, beam) 80 | local W = self.opt.window 81 | for k = 1, beam:size(1) do 82 | 83 | -- Exact unigram matches. 84 | for s, _ in pairs(self.ngrams[1]) do 85 | out[k][s] = out[k][s] + self.opt.unigramBonus 86 | end 87 | 88 | -- Exact bigram matches. 89 | if self.ngrams[2][beam[k][W]] ~= nil then 90 | for s, _ in pairs(self.ngrams[2][beam[k][W]]) do 91 | out[k][s] = out[k][s] + self.opt.bigramBonus 92 | end 93 | end 94 | 95 | -- Exact trigram matches. 96 | if self.ngrams[3][beam[k][W-1]] ~= nil and 97 | self.ngrams[3][beam[k][W-1]][beam[k][W]] then 98 | for s, _ in pairs(self.ngrams[3][beam[k][W-1]][beam[k][W]]) do 99 | out[k][s] = out[k][s] + self.opt.trigramBonus 100 | end 101 | end 102 | 103 | if self.ooordered_ngram[beam[k][W]] ~= nil then 104 | for s, _ in pairs(self.ooordered_ngram[beam[k][W]]) do 105 | out[k][s] = out[k][s] + self.opt.unorderBonus 106 | end 107 | end 108 | end 109 | out:add(self.opt.lengthBonus) 110 | end 111 | 112 | -- Precompute extractive table based on the input article. 113 | function features:match_words(START, article) 114 | self.ooordered_ngram = {} 115 | local ordered_ngram = {} 116 | self.ngrams = {{}, {}, {}} 117 | local hist = {START, START, START, START} 118 | 119 | for j = 1, article:size(1) do 120 | local tw = self.article_to_title[article[j]] 121 | 122 | -- Does the current word exist in title dict. 123 | if tw ~= nil then 124 | for j2 = 1, j do 125 | local tw2 = self.article_to_title[article[j2]] 126 | if tw2 ~= nil then 127 | util.add(ordered_ngram, {tw2, tw}) 128 | if not util.has(ordered_ngram, {tw, tw2}) then 129 | util.add(self.ooordered_ngram, {tw, tw2}) 130 | end 131 | end 132 | end 133 | 134 | util.add(self.ngrams[1], {tw}) 135 | util.add(self.ngrams[2], {hist[3], tw}) 136 | util.add(self.ngrams[3], {hist[2], hist[3], tw}) 137 | end 138 | 139 | -- Advance window. 140 | for k = 2, 4 do 141 | hist[k-1] = hist[k] 142 | end 143 | hist[4] = tw 144 | end 145 | end 146 | 147 | return features 148 | -------------------------------------------------------------------------------- /summary/nnlm.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- Ngram neural language model with auxiliary model 14 | require('nn') 15 | require('nngraph') 16 | require('fbnn') 17 | require('cunn') 18 | require('sys') 19 | local utils = require('summary.util') 20 | 21 | local nnlm = {} 22 | 23 | function nnlm.addOpts() 24 | cmd:option('-epochs', 5, "Number of epochs to train.") 25 | cmd:option('-miniBatchSize', 64, "Size of training minibatch.") 26 | cmd:option('-printEvery', 10000, "How often to print during training.") 27 | cmd:option('-modelFilename', '', "File for saving loading/model.") 28 | cmd:option('-window', 5, "Size of NNLM window.") 29 | cmd:option('-embeddingDim', 50, "Size of NNLM embeddings.") 30 | cmd:option('-hiddenSize', 100, "Size of NNLM hiddent layer.") 31 | cmd:option('-learningRate', 0.1, "SGD learning rate.") 32 | end 33 | 34 | 35 | function nnlm.create_lm(opt, dict, encoder, encoder_size, encoder_dict) 36 | local new_mlp = {} 37 | setmetatable(new_mlp, { __index = nnlm }) 38 | new_mlp.opt = opt 39 | new_mlp.dict = dict 40 | new_mlp.encoder_dict = encoder_dict 41 | new_mlp.encoder_model = encoder 42 | new_mlp.window = opt.window 43 | if encoder ~= nil then 44 | new_mlp:build_mlp(encoder, encoder_size) 45 | end 46 | return new_mlp 47 | end 48 | 49 | 50 | function nnlm:build_mlp(encoder, encoder_size) 51 | -- Set constants 52 | local D = self.opt.embeddingDim 53 | local N = self.opt.window 54 | local H = self.opt.hiddenSize 55 | local V = #self.dict.index_to_symbol 56 | local P = encoder_size 57 | print(H, P) 58 | 59 | -- Input 60 | local context_input = nn.Identity()() 61 | local encoder_input = nn.Identity()() 62 | local position_input = nn.Identity()() 63 | 64 | local lookup = nn.LookupTable(V, D)(context_input) 65 | local encoder_node = encoder({encoder_input, position_input, context_input}) 66 | 67 | -- tanh W (E y) 68 | local lm_mlp = nn.Tanh()(nn.Linear(D * N, H)(nn.View(D * N)(lookup))) 69 | 70 | -- Second layer: takes LM and encoder model. 71 | local mlp = nn.Linear(H + P, V)(nn.View(H + P)(nn.JoinTable(2)( 72 | {lm_mlp, encoder_node}))) 73 | self.soft_max = nn.LogSoftMax()(mlp) 74 | 75 | -- Input is conditional context and ngram context. 76 | self.mlp = nn.gModule({encoder_input, position_input, context_input}, 77 | {self.soft_max}) 78 | 79 | self.criterion = nn.ClassNLLCriterion() 80 | self.lookup = lookup.data.module 81 | self.mlp:cuda() 82 | self.criterion:cuda() 83 | collectgarbage() 84 | end 85 | 86 | 87 | -- Run validation 88 | function nnlm:validation(valid_data) 89 | print("[Running Validation]") 90 | 91 | local offset = 1000 92 | local loss = 0 93 | local total = 0 94 | 95 | valid_data:reset() 96 | while not valid_data:is_done() do 97 | local input, target = valid_data:next_batch(offset) 98 | local out = self.mlp:forward(input) 99 | local err = self.criterion:forward(out, target) * target:size(1) 100 | 101 | -- Augment counters. 102 | loss = loss + err 103 | total = total + target:size(1) 104 | end 105 | print(string.format("[perp: %f validation: %f total: %d]", 106 | math.exp(loss/total), 107 | loss/total, total)) 108 | return loss / total 109 | end 110 | 111 | 112 | function nnlm:renorm(data, th) 113 | local size = data:size(1) 114 | for i = 1, size do 115 | local norm = data[i]:norm() 116 | if norm > th then 117 | data[i]:div(norm/th) 118 | end 119 | end 120 | end 121 | 122 | 123 | function nnlm:renorm_tables() 124 | -- Renormalize the lookup tables. 
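-- Any embedding row whose L2 norm exceeds 1 is scaled back down to norm 1
-- (see `renorm` above); this max-norm constraint is applied once per epoch,
-- just before validation.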
125 | if self.lookup ~= nil then 126 | print(self.lookup.weight:size()) 127 | print(self.lookup.weight:type()) 128 | self:renorm(self.lookup.weight, 1) 129 | end 130 | if self.encoder_model.lookup ~= nil then 131 | self:renorm(self.encoder_model.lookup.weight, 1) 132 | if self.encoder_model.title_lookup ~= nil then 133 | self:renorm(self.encoder_model.title_lookup.weight, 1) 134 | end 135 | end 136 | if self.encoder_model.lookups ~= nil then 137 | for i = 1, #self.encoder_model.lookups do 138 | self:renorm(self.encoder_model.lookups[i].weight, 1) 139 | end 140 | end 141 | end 142 | 143 | 144 | function nnlm:run_valid(valid_data) 145 | -- Run validation. 146 | if valid_data ~= nil then 147 | local cur_valid_loss = self:validation(valid_data) 148 | -- If valid loss does not improve drop learning rate. 149 | if cur_valid_loss > self.last_valid_loss then 150 | self.opt.learningRate = self.opt.learningRate / 2 151 | end 152 | self.last_valid_loss = cur_valid_loss 153 | end 154 | 155 | -- Save the model. 156 | self:save(self.opt.modelFilename) 157 | end 158 | 159 | 160 | function nnlm:train(data, valid_data) 161 | -- Best loss seen yet. 162 | self.last_valid_loss = 1e9 163 | -- Train 164 | for epoch = 1, self.opt.epochs do 165 | data:reset() 166 | self:renorm_tables() 167 | self:run_valid(valid_data) 168 | 169 | -- Loss for the epoch. 170 | local epoch_loss = 0 171 | local batch = 1 172 | local last_batch = 1 173 | local total = 0 174 | local loss = 0 175 | 176 | sys.tic() 177 | while not data:is_done() do 178 | local input, target = data:next_batch(self.opt.miniBatchSize) 179 | if data:is_done() then break end 180 | 181 | local out = self.mlp:forward(input) 182 | local err = self.criterion:forward(out, target) * target:size(1) 183 | local deriv = self.criterion:backward(out, target) 184 | 185 | if not utils.isnan(err) then 186 | loss = loss + err 187 | epoch_loss = epoch_loss + err 188 | 189 | self.mlp:zeroGradParameters() 190 | self.mlp:backward(input, deriv) 191 | self.mlp:updateParameters(self.opt.learningRate) 192 | else 193 | print("NaN") 194 | print(input) 195 | end 196 | 197 | -- Logging 198 | if batch % self.opt.printEvery == 1 then 199 | print(string.format( 200 | "[Loss: %f Epoch: %d Position: %d Rate: %f Time: %f]", 201 | loss / ((batch - last_batch) * self.opt.miniBatchSize), 202 | epoch, 203 | batch * self.opt.miniBatchSize, 204 | self.opt.learningRate, 205 | sys.toc() 206 | )) 207 | sys.tic() 208 | last_batch = batch 209 | loss = 0 210 | end 211 | 212 | batch = batch + 1 213 | total = total + input[1]:size(1) 214 | end 215 | print(string.format("[EPOCH : %d LOSS: %f TOTAL: %d BATCHES: %d]", 216 | epoch, epoch_loss / total, total, batch)) 217 | end 218 | end 219 | 220 | 221 | function nnlm:save(fname) 222 | print("[saving mlp: " .. fname .. "]") 223 | torch.save(fname, self) 224 | return true 225 | end 226 | 227 | 228 | function nnlm:load(fname) 229 | local new_self = torch.load(fname) 230 | for k, v in pairs(new_self) do 231 | if k ~= 'opt' then 232 | self[k] = v 233 | end 234 | end 235 | return true 236 | end 237 | 238 | 239 | return nnlm 240 | -------------------------------------------------------------------------------- /summary/run.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. 
An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | require('torch') 14 | require('nn') 15 | require('sys') 16 | 17 | local nnlm = require('summary.nnlm') 18 | local encoder = require('summary.encoder') 19 | local beam = require('summary.beam_search') 20 | local utils = require('summary.util') 21 | 22 | cmd = torch.CmdLine() 23 | 24 | beam.addOpts(cmd) 25 | 26 | cutorch.setDevice(2) 27 | 28 | cmd:option('-modelFilename', '', 'Model to test.') 29 | cmd:option('-inputf', '', 'Input article files. ') 30 | cmd:option('-nbest', false, 'Write out the nbest list in ZMert format.') 31 | cmd:option('-length', 15, 'Maximum length of summary.') 32 | opt = cmd:parse(arg) 33 | 34 | -- Map the words from one dictionary to another. 35 | local function sync_dicts(dict1, dict2) 36 | local dict_map = torch.ones(#dict1.index_to_symbol):long() 37 | for i = 1, #dict1.index_to_symbol do 38 | local res = dict2.symbol_to_index[dict1.index_to_symbol[i]] 39 | dict_map[i] = res or 1 40 | end 41 | return dict_map 42 | end 43 | 44 | -- Apply digit preprocessing. 45 | local function process_word(input_word) 46 | local word = string.lower(input_word) 47 | for i = 1, word:len() do 48 | if word:sub(i, i) >= '0' and word:sub(i, i) <= '9' then 49 | word = word:sub(1, i-1) .. '#' .. word:sub(i+1) 50 | end 51 | end 52 | return word 53 | end 54 | 55 | local function main() 56 | -- Load in the dictionaries and the input files. 57 | local mlp = nnlm.create_lm(opt) 58 | mlp:load(opt.modelFilename) 59 | local adict = mlp.encoder_dict 60 | local tdict = mlp.dict 61 | 62 | local dict_map = sync_dicts(adict, tdict) 63 | local sent_file = assert(io.open(opt.inputf)) 64 | local len = opt.length 65 | local W = mlp.window 66 | opt.window = W 67 | 68 | local sent_num = 0 69 | for line in sent_file:lines() do 70 | sent_num = sent_num + 1 71 | 72 | -- Add padding. 73 | local true_line = " " .. line .. " " 74 | local words = utils.string_split(true_line) 75 | 76 | local article = torch.zeros(#words) 77 | for j = 1, #words do 78 | local word = process_word(words[j]) 79 | article[j] = adict.symbol_to_index[word] or 80 | adict.symbol_to_index[""] 81 | end 82 | 83 | -- Run beam search. 84 | local sbeam = beam.init(opt, mlp.mlp, mlp.encoder_model, 85 | dict_map, tdict) 86 | local results = sbeam:generate(article, len) 87 | 88 | if not opt.nbest then 89 | if #results == 0 then 90 | io.write("*FAIL*") 91 | else 92 | -- Print out in standard format. 93 | local len, _, output, _ = unpack(results[1]) 94 | local total = 0 95 | for j = W+2, W+len - 1 do 96 | local word = tdict.index_to_symbol[output[j]] 97 | total = total + #word + 1 98 | io.write(word, " " ) 99 | end 100 | end 101 | print("") 102 | else 103 | -- Print out an nbest list in Moses/ZMert format. 104 | for k = 1, #results do 105 | io.write(sent_num-1, " ||| ") 106 | local len, score, output, features = unpack(results[k]) 107 | for j = W+2, W+len - 1 do 108 | io.write(tdict.index_to_symbol[output[j]], " " ) 109 | end 110 | io.write(" ||| ") 111 | for f = 1, features:size(1) do 112 | io.write(features[f], " ") 113 | end 114 | io.write(" ||| ", score) 115 | print("") 116 | end 117 | end 118 | end 119 | end 120 | 121 | main() 122 | -------------------------------------------------------------------------------- /summary/train.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 
3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- The top-level training script 14 | require('torch') 15 | require('nngraph') 16 | 17 | local nnlm = require('summary.nnlm') 18 | local data = require('summary.data') 19 | local encoder = require('summary.encoder') 20 | 21 | cmd = torch.CmdLine() 22 | cmd:text() 23 | cmd:text() 24 | cmd:text('Train a summarization model.') 25 | cmd:text() 26 | 27 | data.add_opts(cmd) 28 | encoder.add_opts(cmd) 29 | nnlm.addOpts(cmd) 30 | 31 | opt = cmd:parse(arg) 32 | 33 | local function main() 34 | -- Load in the data. 35 | local tdata = data.load_title(opt.titleDir, true) 36 | local article_data = data.load_article(opt.articleDir) 37 | 38 | local valid_data = data.load_title(opt.validTitleDir, nil, tdata.dict) 39 | local valid_article_data = 40 | data.load_article(opt.validArticleDir, article_data.dict) 41 | 42 | -- Make main LM 43 | local train_data = data.init(tdata, article_data) 44 | local valid = data.init(valid_data, valid_article_data) 45 | local encoder_mlp = encoder.build(opt, train_data) 46 | local mlp = nnlm.create_lm(opt, tdata.dict, encoder_mlp, 47 | opt.bowDim, article_data.dict) 48 | 49 | mlp:train(train_data, valid) 50 | end 51 | 52 | main() 53 | -------------------------------------------------------------------------------- /summary/util.lua: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright (c) 2015, Facebook, Inc. 3 | -- All rights reserved. 4 | -- 5 | -- This source code is licensed under the BSD-style license found in the 6 | -- LICENSE file in the root directory of this source tree. An additional grant 7 | -- of patent rights can be found in the PATENTS file in the same directory. 
8 | -- 9 | -- Author: Alexander M Rush 10 | -- Sumit Chopra 11 | -- Jason Weston 12 | 13 | -- The utility tool box 14 | local util = {} 15 | 16 | function util.string_shortfloat(t) 17 | return string.format('%2.4g', t) 18 | end 19 | 20 | function util.shuffleTable(t) 21 | local rand = math.random 22 | local iterations = #t 23 | local j 24 | for i = iterations, 2, -1 do 25 | j = rand(i) 26 | t[i], t[j] = t[j], t[i] 27 | end 28 | end 29 | 30 | 31 | function util.string_split(s, c) 32 | if c==nil then c=' ' end 33 | local t={} 34 | while true do 35 | local f=s:find(c) 36 | if f==nil then 37 | if s:len()>0 then 38 | table.insert(t, s) 39 | end 40 | break 41 | end 42 | if f > 1 then 43 | table.insert(t, s:sub(1,f-1)) 44 | end 45 | s=s:sub(f+1,s:len()) 46 | end 47 | return t 48 | end 49 | 50 | 51 | function util.add(tab, key) 52 | local cur = tab 53 | 54 | for i = 1, #key-1 do 55 | local new_cur = cur[key[i]] 56 | if new_cur == nil then 57 | cur[key[i]] = {} 58 | new_cur = cur[key[i]] 59 | end 60 | cur = new_cur 61 | end 62 | cur[key[#key]] = true 63 | end 64 | 65 | function util.has(tab, key) 66 | local cur = tab 67 | for i = 1, #key do 68 | cur = cur[key[i]] 69 | if cur == nil then 70 | return false 71 | end 72 | end 73 | return true 74 | end 75 | 76 | function util.isnan(x) 77 | return x ~= x 78 | end 79 | 80 | return util 81 | -------------------------------------------------------------------------------- /test_model.sh: -------------------------------------------------------------------------------- 1 | export LUA_PATH="$LUA_PATH;?.lua" 2 | 3 | th summary/run.lua \ 4 | -modelFilename $2 \ 5 | -inputf $1 \ 6 | -length $3 \ 7 | -blockRepeatWords 8 | 9 | -------------------------------------------------------------------------------- /train_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WINDOW=5 4 | export OUT_DIR=$1/processed 5 | export MDL_DIR=$1/models 6 | 7 | export LUA_PATH="$LUA_PATH;$ABS/?.lua" 8 | 9 | #bash $ABS/prep_torch_data.sh $2 10 | 11 | mkdir -p $MDL_DIR 12 | 13 | th -i $ABS/summary/train.lua -titleDir $OUT_DIR/train/title/ \ 14 | -articleDir $OUT_DIR/train/article/ \ 15 | -modelFilename $MDL_DIR/$2 \ 16 | -miniBatchSize 64 \ 17 | -embeddingDim 64 \ 18 | -bowDim 200 \ 19 | -hiddenSize 64 \ 20 | -epochs 20 \ 21 | -learningRate 0.1 \ 22 | -validArticleDir $OUT_DIR/valid.filter/article/ \ 23 | -validTitleDir $OUT_DIR/valid.filter/title/ \ 24 | -window $WINDOW \ 25 | -printEvery 100 \ 26 | -encoderModel "attenbow" \ 27 | -attenPool 5 \ 28 | -------------------------------------------------------------------------------- /tuning/SDecoder_cfg.txt: -------------------------------------------------------------------------------- 1 | LM 1.0 2 | uni 4.84922778048135 3 | bi 1.2132386742991166 4 | tri -13.382831610766107 5 | ooo -0.5293249226416208 6 | length 0.0 7 | -------------------------------------------------------------------------------- /tuning/SDecoder_cmd.tpl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import sys 5 | 6 | d = {"src" : , 7 | "model" : , 8 | "title_len" : } 9 | 10 | for l in open("SDecoder_cfg.txt"): 11 | f, val = l.strip().split() 12 | d[f] = val 13 | 14 | cmd = "cd $ABS; th $ABS/summary/run.lua -modelFilename {model} " + \ 15 | "-inputf {src} " + \ 16 | "-length {title_len} -blockRepeatWords -recombine " + \ 17 | "-beamSize 50 " + \ 18 | "-lmWeight {LM} -unigramBonus {uni} -bigramBonus {bi} " + \ 19 | 
"-trigramBonus {tri} -lengthBonus {length} -unorderBonus {ooo} " + \ 20 | "-nbest > $ABS/tuning/nbest.out" 21 | 22 | os.system(cmd.format(d)) 23 | -------------------------------------------------------------------------------- /tuning/SDecoder_test.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015, Facebook, Inc. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the BSD-style license found in the 6 | # LICENSE file in the root directory of this source tree. An additional grant 7 | # of patent rights can be found in the PATENTS file in the same directory. 8 | # 9 | # Author: Alexander M Rush 10 | # Sumit Chopra 11 | # Jason Weston 12 | 13 | import os 14 | import sys 15 | #@lint-avoid-python-3-compatibility-imports 16 | 17 | d = {"src": sys.argv[1], 18 | "model": sys.argv[2], 19 | "title_len": 14} 20 | 21 | for l in open("tuning/blank.params"): 22 | f, val = l.strip().split() 23 | d[f] = val 24 | 25 | cmd = "cd $ABS; $CUTH $ABS/summary/run.lua -modelFilename {model} " + \ 26 | "-inputf {src} -recombine " + \ 27 | "-length {title_len} -blockRepeatWords " + \ 28 | "-lmWeight {LM} -unigramBonus {uni} -bigramBonus {bi} " + \ 29 | "-trigramBonus {tri} -lengthBonus {length} -unorderBonus {ooo} " 30 | 31 | os.system(cmd.format(**d)) 32 | -------------------------------------------------------------------------------- /tuning/ZMERT_cfg.txt: -------------------------------------------------------------------------------- 1 | ### Commonly used parameters 2 | -r ref # target sentences file name (in this case, file name prefix) 3 | -rps 4 # references per sentence 4 | -p params.txt # parameter file 5 | -m BLEU 4 closest # evaluation metric and its options 6 | -ipi 20 # number of intermediate initial points 7 | -cmd ./SDecoder_cmd.py # file containing commands to run decoder 8 | -decOut nbest.out # file prodcued by decoder 9 | -dcfg SDecoder_cfg.txt # decoder config file 10 | -N 500 # size of N-best list generated each iteration 11 | -v 1 # verbosity level (0-2; higher value => 12 | -seed 12341234 # random number generator seed 13 | -------------------------------------------------------------------------------- /tuning/params.txt: -------------------------------------------------------------------------------- 1 | LM ||| 1.0 Fix 0.0 +Inf -1 +1 2 | uni ||| 0.0 Opt -Inf +Inf -1 +1 3 | bi ||| 0.0 Opt -Inf +Inf -1 +1 4 | tri ||| 0.0 Opt -Inf +Inf -1 +1 5 | ooo ||| 0.0 Opt -Inf 0 -1 +1 6 | length ||| 0.0 Fix -Inf +Inf -1 +1 7 | normalization = none 8 | --------------------------------------------------------------------------------