├── sumpy
│ ├── data
│ │ ├── smart_common_words.txt.gz
│ │ ├── mead_example_docs
│ │ │ ├── 41.docsent
│ │ │ ├── 87.docsent
│ │ │ └── 81.docsent
│ │ ├── duc07_task2.json
│ │ ├── duc03_task2.json
│ │ └── duc04_task2.json
│ ├── simple.py
│ ├── system
│ │ ├── __init__.py
│ │ ├── _graph.py
│ │ ├── _submodular.py
│ │ ├── _baseline.py
│ │ └── _base.py
│ ├── annotators
│ │ ├── __init__.py
│ │ ├── _annotator_base.py
│ │ ├── _preprocessor.py
│ │ ├── _submodular.py
│ │ └── _feature_extractors.py
│ ├── document.py
│ ├── io.py
│ ├── eval.py
│ ├── preprocessor.py
│ ├── __init__.py
│ └── util.py
├── NOTICE
├── .gitignore
├── setup.py
├── README.md
├── duc_testbed.py
└── LICENSE
/sumpy/data/smart_common_words.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kedz/sumpy/HEAD/sumpy/data/smart_common_words.txt.gz
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Sumpy: a multidocument summarization library for python.
2 | Copyright 2015 Chris Kedzie
3 |
4 | This product includes software developed at
5 | Columbia University.
6 |
--------------------------------------------------------------------------------
/sumpy/simple.py:
--------------------------------------------------------------------------------
1 | import sumpy
2 |
3 | def lede(inputs):
4 | s = sumpy.system.LedeSummarizer()
5 | return s.summarize(inputs)
6 |
7 | def centroid(inputs):
8 | s = sumpy.system.CentroidSummarizer()
9 | return s.summarize(inputs)
10 |
11 | def mmr(inputs):
12 | s = sumpy.system.MMRSummarizer()
13 | return s.summarize(inputs)
14 |
15 | def textrank(inputs):
16 | s = sumpy.system.TextRankSummarizer()
17 | return s.summarize(inputs)
18 |
19 | def lexrank(inputs):
20 | s = sumpy.system.LexRankSummarizer()
21 | return s.summarize(inputs)
22 |
--------------------------------------------------------------------------------
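
For quick experimentation, these wrappers can be pointed at the bundled MEAD example documents via sumpy.io.load_demo_docs (defined in sumpy/io.py below). A minimal sketch, assuming the package is installed and NLTK can fetch the punkt model on first use:

    import sumpy
    import sumpy.io

    docs = sumpy.io.load_demo_docs()   # the three bundled .docsent articles
    summary = sumpy.lexrank(docs)      # a sumpy.document.Summary object
    print summary                      # renders a budgeted, numbered summary
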
/sumpy/system/__init__.py:
--------------------------------------------------------------------------------
1 | from sumpy.system._base import AverageFeatureRankerBase
2 | from sumpy.system._baseline import (LedeSummarizer, CentroidSummarizer,
3 | MMRSummarizer)
4 | from sumpy.system._graph import TextRankSummarizer, LexRankSummarizer
5 | from sumpy.system._submodular import MonotoneSubmodularBasic, SubmodularMMRSummarizer
6 |
7 | __all__ = ["LedeSummarizer", "CentroidSummarizer", "MMRSummarizer",
8 | "TextRankSummarizer", "LexRankSummarizer",
9 | "MonotoneSubmodularBasic", "SubmodularMMRSummarizer, AverageFeatureRankerBase"]
10 |
--------------------------------------------------------------------------------
/sumpy/annotators/__init__.py:
--------------------------------------------------------------------------------
1 | from sumpy.annotators._preprocessor import (SentenceTokenizerMixin,
2 | WordTokenizerMixin, RawBOWMixin, BinaryBOWMixin, TfIdfMixin,
3 | TfIdfCosineSimilarityMixin)
4 | from sumpy.annotators._feature_extractors import (LedeMixin, TextRankMixin,
5 | LexRankMixin, CentroidMixin, MMRMixin)
6 | from sumpy.annotators._submodular import MonotoneSubmodularMixin, SubmodularMMRMixin
7 |
8 |
9 | __all__ = ['SentenceTokenizerMixin', 'WordTokenizerMixin', 'RawBOWMixin',
10 | 'BinaryBOWMixin', 'TfIdfMixin', 'TfIdfCosineSimilarityMixin',
11 | 'LedeMixin', 'TextRankMixin', 'LexRankMixin', 'CentroidMixin',
12 | 'MMRMixin', 'MonotoneSubmodularMixin', 'SubmodularMMRMixin']
13 |
--------------------------------------------------------------------------------
/sumpy/annotators/_annotator_base.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractmethod
2 |
3 | class _AnnotatorBase(object):
4 | __metaclass__ = ABCMeta
5 |
6 | @abstractmethod
7 | def requires(self):
8 | pass
9 |
10 | @abstractmethod
11 | def ndarray_requires(self):
12 | pass
13 |
14 | @abstractmethod
15 | def returns(self):
16 | pass
17 |
18 | @abstractmethod
19 | def ndarray_returns(self):
20 | pass
21 |
22 | @abstractmethod
23 | def name(self):
24 | pass
25 |
26 | @abstractmethod
27 | def build(self):
28 | pass
29 |
30 | @abstractmethod
31 | def process(self, input_df, ndarray_data):
32 | pass
33 |
--------------------------------------------------------------------------------
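
Concrete annotators implement this interface as mixins (see sumpy/annotators/_preprocessor.py below): requires/returns name DataFrame columns, ndarray_requires/ndarray_returns name keys in the shared ndarray_data dict, build sets up resources, and process transforms both and hands them back. As an illustration only, a hypothetical SentenceLengthMixin that adds a word-count feature column might look like this:

    from sumpy.annotators._annotator_base import _AnnotatorBase

    class SentenceLengthMixin(_AnnotatorBase):
        """Hypothetical example annotator: adds an f:nwords feature column."""

        def requires(self):
            return ["words"]            # produced by WordTokenizerMixin

        def ndarray_requires(self):
            return []

        def returns(self):
            return ["f:nwords"]

        def ndarray_returns(self):
            return []

        def name(self):
            return "SentenceLengthMixin"

        def build(self):
            pass                        # nothing to initialize

        def process(self, input_df, ndarray_data):
            input_df["f:nwords"] = input_df["words"].apply(len)
            return input_df, ndarray_data

Mixed in ahead of _SystemBase (as in sumpy/system/_baseline.py), such a class is picked up automatically by the dependency-graph pipeline in sumpy/system/_base.py.
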
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 |
43 | # Translations
44 | *.mo
45 | *.pot
46 |
47 | # Django stuff:
48 | *.log
49 |
50 | # Sphinx documentation
51 | docs/_build/
52 |
53 | # PyBuilder
54 | target/
55 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import os
3 | import sys
4 |
5 | data_dir = os.path.join(sys.prefix, "data")
6 | setup(
7 | name = 'sumpy',
8 | packages = ['sumpy', 'sumpy.system', 'sumpy.annotators'],
9 | version = '0.0.1',
10 | description = 'SUMPY: an automatic text summarization library',
11 | author='Chris Kedzie',
12 | author_email='kedzie@cs.columbia.edu',
13 | url='https://github.com/kedz/sumpy',
14 | install_requires=[
15 | 'nltk', 'numpy', 'scipy', 'scikit-learn', 'pandas',
16 | 'networkx',
17 | ],
18 | include_package_data=True,
19 | package_data={
20 | 'sumpy': [os.path.join(data_dir, 'smart_common_words.txt.gz'),
21 | os.path.join(data_dir, 'mead_example_docs', '41.docsent'),
22 | os.path.join(data_dir, 'mead_example_docs', '81.docsent'),
23 | os.path.join(data_dir, 'mead_example_docs', '87.docsent'),
24 | ]},
25 |
26 | )
27 |
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sumpy
2 | SUMPY: an automatic text summarization library for Python
3 |
4 | We currently have several baseline summarizers implemented:
5 |
6 | - [x] lede
7 |
8 | - [x] TextRank
9 |
10 | - [x] LexRank
11 |
12 | - [x] Centroid
13 |
14 | - [x] ROUGE ngram evaluation
15 |
16 | and have plans to implement many more.
17 |
18 | TODO:
19 |
20 | - [ ] ROUGE skip-gram and LCS evaluation
21 |
22 | - [ ] FreqSum/SumBasic
23 |
24 | - [ ] Submodular optimization based summarizers
25 |
26 | - [ ] LDA/distributed sentence representation based summarizer
27 |
28 | - [ ] DEMS
29 |
30 | - [ ] ILP based summarizers
31 |
32 | - [ ] collect topic signatures/important word lists
33 |
34 | SUMPY contains several ready-to-use summarizers with
35 | sensible defaults. Here is a simple example to get you started:
36 |
37 | import sumpy
38 |
39 | doc1 = "This is the text for document1. It is for explanatory purposes..."
40 | doc2 = "This is another document text..."
41 | doc3 = "And yet another document..."
42 |
43 | inputs = [doc1, doc2, doc3]
44 |
45 | print "lede summarizer:"
46 | print sumpy.lede(inputs)
47 |
48 | print "\ntextrank summarizer:"
49 | print sumpy.textrank(inputs)
50 |
51 | print "\ncentroid summarizer:"
52 | print sumpy.centroid(inputs)
53 |
--------------------------------------------------------------------------------
/sumpy/system/_graph.py:
--------------------------------------------------------------------------------
1 | from sumpy.system._base import _SystemBase
2 | from sumpy.annotators import TextRankMixin, LexRankMixin
3 | from sumpy.document import Summary
4 |
5 | class TextRankSummarizer(TextRankMixin, _SystemBase):
6 |
7 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
8 | directed=u"undirected", d=.85, tol=.0001, max_iters=20,
9 | verbose=False):
10 | self._sentence_tokenizer = sentence_tokenizer
11 | self._word_tokenizer = word_tokenizer
12 | self.directed = directed
13 | self.d = d
14 | self.tol = tol
15 | self.max_iters = max_iters
16 | super(TextRankSummarizer, self).__init__(verbose=verbose)
17 |
18 | def build_summary(self, input_df, ndarray_data):
19 | output_df = input_df.sort(["f:textrank"], ascending=False)
20 | return Summary(output_df)
21 |
22 | class LexRankSummarizer(LexRankMixin, _SystemBase):
23 |
24 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
25 | d=.85, tol=.0001, max_iters=20,
26 | verbose=False):
27 | self._sentence_tokenizer = sentence_tokenizer
28 | self._word_tokenizer = word_tokenizer
29 | self.d = d
30 | self.tol = tol
31 | self.max_iters = max_iters
32 | super(LexRankSummarizer, self).__init__(verbose=verbose)
33 |
34 | def build_summary(self, input_df, ndarray_data):
35 | output_df = input_df.sort(["f:lexrank"], ascending=False)
36 | return Summary(output_df)
37 |
38 |
--------------------------------------------------------------------------------
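
Both graph-based summarizers expose their hyperparameters through the constructor. A minimal sketch with placeholder documents, showing a non-default iteration budget for TextRank:

    from sumpy.system import TextRankSummarizer

    docs = ["First placeholder document text.",
            "Second placeholder document text."]
    summarizer = TextRankSummarizer(d=.85, tol=.0001, max_iters=50, verbose=True)
    print summarizer.summarize(docs)
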
/sumpy/system/_submodular.py:
--------------------------------------------------------------------------------
1 | from sumpy.system._base import _SystemBase
2 | from sumpy.annotators import MonotoneSubmodularMixin, SubmodularMMRMixin
3 | from sumpy.document import Summary
4 |
5 | class SubmodularMMRSummarizer(SubmodularMMRMixin, _SystemBase):
6 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
7 | lam=.3, budget_type="word", budget_size=400, scale=.2,
8 | verbose=False):
9 | self.sentence_tokenizer = sentence_tokenizer
10 | self.word_tokenizer = word_tokenizer
11 | self.lam = lam
12 | self.scale = scale
13 | self.budget_type = budget_type
14 | self.budget_size = budget_size
15 |
16 | super(SubmodularMMRSummarizer, self).__init__(verbose=verbose)
17 |
18 | def build_summary(self, input_df, ndarray_data):
19 | output_df = input_df[input_df["f:submodular-mmr"].isnull() == False]
20 | output_df = output_df.sort(["doc id", "sent id"], ascending=True)
21 | print output_df
22 | print output_df['sent text'].apply(len)
23 | return Summary(output_df)
24 |
25 |
26 | class MonotoneSubmodularBasic(MonotoneSubmodularMixin, _SystemBase):
27 |
28 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
29 | k=5, f_of_A=None, verbose=False):
30 |
31 | self.sentence_tokenizer = sentence_tokenizer
32 | self.word_tokenizer = word_tokenizer
33 | self.k = k
34 | self.f_of_A = f_of_A
35 | super(MonotoneSubmodularBasic, self).__init__(verbose=verbose)
36 |
37 | def build_summary(self, input_df, ndarray_data):
38 | output_df = input_df[input_df["f:monotone-submod"] == 1]
39 | output_df = output_df.sort(["doc id", "sent id"], ascending=True)
40 | return Summary(output_df)
41 |
42 |
--------------------------------------------------------------------------------
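
MonotoneSubmodularBasic greedily picks k sentences, scoring candidate selections with f_of_A; the default objective (defined in sumpy/annotators/_submodular.py below) counts the unique words covered. A custom objective with the same signature can be supplied; a sketch with placeholder documents:

    from sumpy.system import MonotoneSubmodularBasic

    docs = ["First placeholder document text.",
            "Second placeholder document text."]

    # f_of_A(system, A, V_min_A, e, input_df, ndarray_data) scores the candidate selection A
    def coverage(system, A, V_min_A, e, input_df, ndarray_data):
        covered = set()
        for words in input_df.ix[A, "words"].tolist():
            covered.update(words)
        return len(covered)

    print MonotoneSubmodularBasic(k=3, f_of_A=coverage).summarize(docs)
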
/sumpy/system/_baseline.py:
--------------------------------------------------------------------------------
1 | from sumpy.system._base import _SystemBase
2 | from sumpy.annotators import (WordTokenizerMixin, LedeMixin, MMRMixin,
3 | CentroidMixin)
4 | from sumpy.document import Summary
5 |
6 | class LedeSummarizer(WordTokenizerMixin, LedeMixin, _SystemBase):
7 |
8 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
9 | verbose=False):
10 | self._sentence_tokenizer = sentence_tokenizer
11 | self._word_tokenizer = word_tokenizer
12 | super(LedeSummarizer, self).__init__(verbose=verbose)
13 |
14 | def build_summary(self, input_df, ndarray_data):
15 | output_df = input_df[input_df[u"f:lede"] == 1].sort(
16 | ["doc id"], ascending=True)
17 | return Summary(output_df)
18 |
19 | class CentroidSummarizer(CentroidMixin, _SystemBase):
20 |
21 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
22 | verbose=False):
23 | self._sentence_tokenizer = sentence_tokenizer
24 | self._word_tokenizer = word_tokenizer
25 | super(CentroidSummarizer, self).__init__(verbose=verbose)
26 |
27 | def build_summary(self, input_df, ndarray_data):
28 | output_df = input_df.sort(["f:centroid"], ascending=False)
29 | return Summary(output_df)
30 |
31 | class MMRSummarizer(MMRMixin, _SystemBase):
32 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
33 | lam=.4, verbose=False):
34 | self._sentence_tokenizer = sentence_tokenizer
35 | self._word_tokenizer = word_tokenizer
36 | self.lam = lam
37 | super(MMRSummarizer, self).__init__(verbose=verbose)
38 |
39 | def build_summary(self, input_df, ndarray_data):
40 | output_df = input_df.sort(["f:mmr"], ascending=False)
41 | return Summary(output_df)
42 |
--------------------------------------------------------------------------------
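
MMRSummarizer exposes lam, which (in the usual maximal-marginal-relevance formulation) trades off relevance against redundancy: a higher value favors relevance, a lower value penalizes redundancy more heavily. A minimal sketch with placeholder documents:

    from sumpy.system import MMRSummarizer

    docs = ["First placeholder document text.",
            "Second placeholder document text."]
    print MMRSummarizer(lam=.7).summarize(docs)
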
/sumpy/document.py:
--------------------------------------------------------------------------------
1 | import re
2 | import textwrap
3 |
4 | class Summary(object):
5 | def __init__(self, df):
6 | self._df = df
7 |
8 | def budget(self, type="byte", size=600):
9 | summary = []
10 | if size == "all":
11 | summary = self._df["sent text"].tolist()
12 | elif type == "word":
13 | remaining = size
14 | for idx, sent in self._df.iterrows():
15 | num_words = min(len(sent["words"]), remaining)
16 | summary.append(u" ".join(sent["words"][0 : num_words]))
17 | remaining -= num_words
18 | if remaining < 1:
19 | break
20 | elif type == "byte":
21 | remaining = size
22 | for idx, sent in self._df.iterrows():
23 | num_chars = min(len(sent["sent text"]), remaining)
24 | print num_chars
25 | summary.append(sent["sent text"][0 : num_chars])
26 | remaining -= num_chars
27 | if remaining < 1:
28 | break
29 | return u"\n".join(textwrap.fill(u"{}) {}".format(i, sent))
30 | for i, sent in enumerate(summary, 1)) + u" ..."
31 |
32 | def __unicode__(self):
33 | return self.budget()
34 |
35 | def __str__(self):
36 | return unicode(self).encode("utf-8")
37 |
38 |
39 | class Document(object):
40 | def __init__(self, name, text):
41 | self.name = name
42 | if isinstance(self.name, str):
43 | self.name = self.name.decode(u"utf-8")
44 | self.text = text
45 | if isinstance(self.text, str):
46 | self.text = self.text.decode(u"utf-8")
47 |
48 | def __str__(self):
49 | return unicode(self).encode(u"utf-8")
50 |
51 | def __unicode__(self):
52 | return self.name + u"\n" + self.text
53 |
54 | class DocSet(object):
55 | def __init__(self, docs):
56 | self.docs = docs
57 |
--------------------------------------------------------------------------------
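
Summary wraps a ranked DataFrame and renders it under a byte or word budget. A minimal sketch of budget() on a hand-built frame whose columns mirror those produced by the pipeline ("sent text" and "words"):

    import pandas as pd
    from sumpy.document import Summary

    df = pd.DataFrame([
        {"sent text": u"The first ranked sentence.",
         "words": [u"The", u"first", u"ranked", u"sentence", u"."]},
        {"sent text": u"The second ranked sentence.",
         "words": [u"The", u"second", u"ranked", u"sentence", u"."]},
    ])
    print Summary(df).budget(type="word", size=8)
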
/duc_testbed.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pandas as pd
3 | import os
4 | import sumpy
5 | import sumpy.eval
6 |
7 | def load_docsets(duc_dir):
8 |
9 | docset_paths = [os.path.join(duc_dir, fname)
10 | for fname in os.listdir(duc_dir)]
11 | docset_paths = [path for path in docset_paths if os.path.isdir(path)]
12 | docsets = {}
13 | for docset_path in docset_paths:
14 | docset_id, docs, models = load_docset(docset_path)
15 | docsets[docset_id] = {u"docs": docs, u"models": models}
16 | return docsets
17 |
18 | def load_docset(docset_path):
19 | docset_id = os.path.split(docset_path)[1]
20 | docs_path = os.path.join(docset_path, u"docs")
21 | docs = sumpy.io.load_duc_docset(docs_path)
22 | models = []
23 | for fname in os.listdir(docset_path):
24 | if docset_id in fname:
25 | model_paths = [os.path.join(docset_path, fname, length)
26 | for length in [u"200", u"400"]]
27 | model_sums = sumpy.io.load_duc_abstractive_summaries(model_paths)
28 | models.extend(model_sums)
29 | return docset_id, docs, models
30 |
31 |
32 | def generate_summaries(systems, docsets):
33 | rouge = sumpy.eval.ROUGE(max_ngrams=2, limit=100, limit_type=u"word")
34 | results = []
35 | for docset_id in docsets.keys():
36 | #print docset_id
37 | docs = docsets[docset_id][u"docs"]
38 | models = docsets[docset_id][u"models"]
39 | sys_sums = [(system_name, unicode(sum_func(docs)))
40 | for system_name, sum_func in systems]
41 | df = rouge.evaluate(sys_sums, models)
42 | results.append(df)
43 | return pd.concat(results).groupby(level=0).mean()
44 |
45 | def main(duc_dir):
46 | print u"Loading DUC document sets from:", duc_dir
47 | docsets = load_docsets(duc_dir)
48 |
49 | lede = lambda x: sumpy.lede(x)
50 | centroid = lambda x: sumpy.centroid(x)
51 | lexrank = lambda x: sumpy.lexrank(x)
52 | systems = [(u"lede", lede), (u"centroid", centroid),
53 | (u"lexrank", lexrank)]
54 | print generate_summaries(systems, docsets)
55 |
56 | if __name__ == u"__main__":
57 | parser = argparse.ArgumentParser()
58 | parser.add_argument(u"-d", u"--duc-dir", required=True, type=unicode,
59 | help=u"path to DUC document set directory")
60 | args = parser.parse_args()
61 | duc_dir = args.duc_dir
62 | main(duc_dir)
63 |
64 |
--------------------------------------------------------------------------------
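
The script is normally run as python duc_testbed.py -d <duc-docset-dir>, but the same pipeline can be driven programmatically from the repository root. A sketch; the path is a placeholder and must follow the layout load_docset expects (one subdirectory per docset containing a docs folder and model summaries):

    import sumpy
    from duc_testbed import load_docsets, generate_summaries

    docsets = load_docsets("/path/to/duc/docsets")   # placeholder path
    systems = [(u"lede", sumpy.lede), (u"lexrank", sumpy.lexrank)]
    print generate_summaries(systems, docsets)
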
/sumpy/data/mead_example_docs/41.docsent:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Egyptians Suffer Second Air Tragedy in a Year
6 |
7 | CAIRO, Egypt -- The crash of a Gulf Air flight that killed 143 people in Bahrain is a disturbing deja vu for Egyptians: It is the second plane crash within a year to devastate this Arab country.
8 | Sixty-three Egyptians were on board the Airbus A320, which crashed into shallow Persian Gulf waters Wednesday night after circling and trying to land in Bahrain.
9 | On Oct. 31, 1999, a plane carrying 217 mostly Egyptian passengers crashed into the Atlantic Ocean off Massachusetts.
10 | The cause has not been determined, providing no closure to the families, whose grief was reopened this month with the release of a factual report by the National Transportation Safety Board.
11 | Walid Mourad, head of the Egyptian Pilots Association and a voice often heard in relation to the EgyptAir investigation, said Wednesday's crash is a tragedy for the Arab people as a whole.
12 | "We are all family and brothers.
13 | We all have something in this," Mourad said.
14 | "But for the Egyptians, this is a double blow.
15 | Two disasters in a row for the Egyptians."
16 | Many of the passengers on the Gulf Air flight were headed for jobs in Bahrain or elsewhere in the Gulf.
17 | Rida Hassan was one of those escaping Egypt's moribund economy for work in the oil-rich Gulf.
18 | Hassan's uncle said he rushed to the Cairo airport after hearing a list of the passengers read on television.
19 | The uncle, who would not give his name, said his nephew had come home to get married and stayed only a month.
20 | Hassan worked in a restaurant in Bahrain, his uncle said before disappearing into a room at the airport set aside for relatives desperate for news.
21 | In the hours just after the crash, relatives at the Cairo airport expressed anger and frustration at Gulf Air for the slow release of information.
22 | Women screamed and men tried vainly to calm them.
23 | "No information is being given to us.
24 | Absolutely nothing," Mohammed Ibrahim el-Naggar said hours after the crash.
25 | "We were told that there were some survivors but no names were given."
26 | El-Naggar said his cousin, her husband who works in Dubai, and their two children aged 2 and 3 were on the downed plane.
27 | Gulf Air said it was sending a special plane to carry 134 relatives to Manama airport later Thursday.
28 | "All necessary measures have been taken to receive the families of the victims," Mohammed al-Sayed Abbas, the Egyptian ambassador to Bahrain, told Egyptian television.
29 | "The embassy staff will be with them step by step until they identify the deceased."
30 | In Bahrain, relatives were beginning the wrenching process of identifying the victims from photographs taken after the bodies were retrieved from the Gulf.
31 | Egypt, which lacks the oil wealth of the Gulf and has an economy struggling to revive from decades of socialist stagnation, has a long tradition of sending workers to the Gulf to fill everything from skilled to menial jobs.
32 | Remittances from citizens working abroad make up Egypt's biggest source of foreign exchange.
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/sumpy/io.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import pandas as pd
4 |
5 | def load_duc_docset(input_source):
6 | docs = DucSgmlReader().read(input_source)
7 | return docs
8 |
9 | def load_duc_abstractive_summaries(input_source):
10 | models = DucAbstractSgmlReader().read(input_source)
11 | return models
12 |
13 | class FileInput(object):
14 |
15 | def gather_paths(self, source):
16 | """Determines the type of source and return an iterator over input
17 | document paths. If source is a str or unicode
18 | object, determine if it is also a directory and return an iterator
19 | for all directory files; otherwise treat as a single document input.
20 | If source is any other iterable, treat as an iterable of file
21 | paths."""
22 |
23 | if isinstance(source, str) or isinstance(source, unicode):
24 | if os.path.isdir(source):
25 | paths = [os.path.join(source, fname)
26 | for fname in os.listdir(source)]
27 | for path in paths:
28 | yield path
29 | else:
30 | yield source
31 |
32 | else:
33 | try:
34 | for path in source:
35 | yield path
36 | except TypeError:
37 | print source, 'is not iterable'
38 |
39 | class DucSgmlReader(FileInput):
40 |
41 | def read(self, input_source):
42 | docs = []
43 | for path in self.gather_paths(input_source):
44 | with open(path, u"r") as f:
45 | sgml = "".join(f.readlines())
46 | m = re.search(r"(.*?)", sgml, flags=re.DOTALL)
47 | if m is None:
48 | raise Exception("TEXT not found in " + path)
49 | text = m.group(1).strip()
50 | text_clean = re.sub(r"<[^>]*?>", r"", text)
51 | docs.append(text_clean)
52 | return docs
53 |
54 | class DucAbstractSgmlReader(FileInput):
55 | def read(self, input_source):
56 | docs = []
57 | for path in self.gather_paths(input_source):
58 | with open(path, u"r") as f:
59 | sgml = "".join(f.readlines())
60 | m = re.search(r"]+>(.*?)", sgml, flags=re.DOTALL)
61 | if m is None:
62 | raise Exception("SUM not found in " + path)
63 | text = m.group(1).strip()
64 | docs.append(text)
65 | return docs
66 |
67 | class MeadDocSentReader(FileInput):
68 | docsent_patt = (r"<DOCSENT DID='([^']+)' DOCNO='([^']+)' "
69 |                 r"LANG='([^']+)'[^>]*>")
70 | sent_patt = (r"<S PAR='([^']+)' RSNT='([^']+)' SNO='([^']+)'>"
71 |              r"(.*?)"
72 |              r"</S>")
73 | def read(self, input_source):
74 | docs = []
75 | for path in self.gather_paths(input_source):
76 | sents = []
77 | with open(path, u"r") as f:
78 | xml = "".join(f.readlines())
79 | m = re.search(self.docsent_patt, xml, flags=re.DOTALL)
80 | if m is None:
81 | raise Exception("DOCSENT not found in " + path)
82 | doc_id = m.group(1)
83 | lang = m.group(3)
84 | for s in re.finditer(self.sent_patt, xml, flags=re.DOTALL):
85 | par = int(s.group(1))
86 | rsnt = s.group(2)
87 | sno = s.group(3)
88 | text = s.group(4).strip()
89 | if par > 1:
90 | sents.append(text)
91 | #sents.append({u"doc id": doc_id, u"sent id": int(rsnt),
92 | # u"type": u"body" if par > 1 else u"headline",
93 | # u"text": text.decode("utf-8")})
94 | docs.append("\n".join(sents).decode("utf-8"))
95 | #df = pd.DataFrame(
96 | # sents, columns=[u"doc id", u"type", u"sent id", u"text"])
97 | #df.set_index([u"doc id", u"sent id"], inplace=True)
98 | return docs
99 |
100 | def load_demo_docs():
101 | import pkg_resources
102 | input_source = pkg_resources.resource_filename(
103 | "sumpy",
104 | os.path.join("data", "mead_example_docs"))
105 | return MeadDocSentReader().read(input_source)
106 |
--------------------------------------------------------------------------------
/sumpy/data/mead_example_docs/87.docsent:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Prayers for victims of Bahrain crash
6 |
7 | MANAMA, Bahrain (AP) _ Three bodies wrapped in cloth, one the size of a small child, were lain before the faithful in the Grand Mosque Friday during a special prayer for the dead in honor of the 143 victims of the Gulf Air crash.
8 | Bahrain"s Prime Minister Sheik Khalifa bin Salman Al Khalifa and other top officials stood side-by-side with 2,000 Muslims reciting funeral prayers before the bodies, which were among the 107 adults and 36 children killed in Wednesday"s air disaster, said Information Ministry spokesman Syed el-Bably.
9 | Around the island, weekly Friday prayer services devoted time to funeral prayers for the passengers and crew.
10 | Across the street at the Gulf Hotel, relatives of the victims sought comfort from religious leaders and counselors as they continued the painful process of identifying loved ones from books of photographs of remains.
11 | "It"s very difficult to see the pictures," said Nadr al-Khawaja, a Bahraini whose cousin, her husband and their 9-month-old baby son were killed.
12 | "It"s very hard for the parents _ it"s torture."
13 | Salvage attempts were continuing in the shallow waters at the crash site Friday.
14 | Twenty-six U.S. divers joined Bahraini experts scouring the sandy sea floor in search for more bits of wing and fuselage from Gulf Air flight 072.
15 | At dawn Friday, the divers began searching for "diplomatic cargo" being carried by a U.S. government courier, according to Cdr. Jeff Gradeck, spokesman for the U.S. Navy"s 5th Fleet, which is based in Bahrain.
16 | The State Department has said the courier, 31-year-old Seth Foti, was carrying pouches containing classified information.
17 | By midafternoon, there was no word of their recovery.
18 | The U.S. Embassy in Bahrain was planning a private memorial service Saturday for Foti.
19 | He and his wife of three months, Anisha, met at the embassy, where she had worked briefly last year.
20 | Scraps of metal and other remnants were brought to an airport hangar where aviation experts were reconstructing the Airbus 320 for investigators, said Gulf Air spokesman Stephen Tuckwell.
21 | Both of the plane"s "black boxes" _ the flight data and voice cockpit recorders _ were to be shipped abroad for data recovery but aviation experts had not finalized plans on Friday, Gulf Air said.
22 | Tuckwell said it could take weeks before the data was recovered.
23 | Bahrain"s State television had quoted witnesses soon after the crash who described seeing a fire in one of the aircraft"s engines.
24 | Gulf Air officials said there was no fire and other witnesses have said they did not see flames.
25 | Meanwhile, the U.S. Embassy here said air accident investigators from the National Transportation and Safety Board were en route to Manama on Friday to join Bahraini investigators in determining the cause of the crash.
26 | Six French government experts and a representative of Airbus Industries arrived Thursday evening to look into the crash _ the sixth for an Airbus 320 in the last 12 years.
27 | Flight 072 crashed in shallow water near shore and Ali Ahmedi, a spokesman and an acting vice president for Gulf Air, has said the pilot gave no indication to air traffic controllers that there were any problems in the plane.
28 | Gulf Air said 135 passengers and eight crew members were on board.
29 | Sixty-three passengers were Egyptian, 34 Bahraini, 12 Saudi Arabian, nine Palestinian, six from the United Arab Emirates, three Chinese, two British and one each from Canada, Oman, Kuwait, Sudan, Australia and the United States.
30 | Two crew members were Bahrainis with one each from Oman, the Philippines, Poland, India, Morocco and Egypt.
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/sumpy/eval.py:
--------------------------------------------------------------------------------
1 | from nltk.util import ngrams
2 | from sumpy.preprocessor import (SentenceTokenizerMixin,
3 | ROUGEWordTokenizerMixin, SMARTStopWordsMixin, LengthLimiterMixin)
4 | import pandas as pd
5 |
6 | class ROUGE(SentenceTokenizerMixin, ROUGEWordTokenizerMixin,
7 | SMARTStopWordsMixin, LengthLimiterMixin):
8 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None,
9 | max_ngrams=2, remove_stopwords=False, stopwords=None,
10 | show_per_model_results=False, limit=None, limit_type=None):
11 |
12 | self._sentence_tokenizer = sentence_tokenizer
13 | self._word_tokenizer = word_tokenizer
14 | self._max_ngrams = max_ngrams
15 | self.remove_stopwords = remove_stopwords
16 | self._stopwords = stopwords
17 | self._show_per_model_results = show_per_model_results
18 | self._limit = limit
19 | self._limit_type = limit_type
20 |
21 | def evaluate(self, systems, models):
22 | models = list(models) # make model order consistent
23 | sent_tokenizer = self.build_sent_tokenizer()
24 | word_tokenizer = self.build_word_tokenizer()
25 | length_limiter = self.build_length_limiter()
26 | is_stopword = self.build_stopwords()
27 | results = []
28 | result_index = []
29 | for name, system in systems:
30 | sys_ngram_sets = self.extract_ngrams(
31 | system, sent_tokenizer, word_tokenizer, self._max_ngrams,
32 | is_stopword, length_limiter)
33 |
34 | for model_no, model in enumerate(models, 1):
35 | model_ngram_sets = self.extract_ngrams(
36 | model, sent_tokenizer, word_tokenizer, self._max_ngrams,
37 | is_stopword, length_limiter)
38 | scores = self.compute_prf(
39 | sys_ngram_sets, model_ngram_sets, self._max_ngrams)
40 | result_index.append((name, model_no))
41 | results.append(scores)
42 |
43 | # Collect results as a pandas DataFrame and compute the mean
44 | # performance.
45 | col_index = []
46 | dataframe_cols = []
47 | for i in xrange(1, self._max_ngrams + 1):
48 | rouge_n = u"ROUGE-{}".format(i)
49 | col_index.append((rouge_n, "Recall"))
50 | col_index.append((rouge_n, "Prec."))
51 | col_index.append((rouge_n, "F1"))
52 |
53 | row_index = pd.MultiIndex.from_tuples(
54 | result_index, names=['system', 'model'])
55 | col_index = pd.MultiIndex.from_tuples(col_index)
56 | df = pd.DataFrame(results, columns=col_index, index=row_index)
57 | df2 = df.groupby(level=0).mean()
58 | if self._show_per_model_results is True:
59 | df2['model'] = 'AVG'
60 | df2 = df2.reset_index().set_index(['system','model']).append(df)
61 | df2 = df2.sort()
62 |
63 | return df2
64 |
65 | def extract_ngrams(self, text, sent_tokenizer, word_tokenizer, max_ngrams,
66 | is_stopword, length_limiter):
67 | ngram_sets = {}
68 | sents = sent_tokenizer(text)
69 |
70 | tokens = []
71 | for sent in sents:
72 | tokens.extend([word.lower() for word in word_tokenizer(sent)])
73 |
74 | # Remove stopwords.
75 | tokens = [word for word in tokens if is_stopword(word) is False]
76 | tokens = length_limiter(tokens)
77 |
78 | for i in xrange(1, max_ngrams + 1):
79 | ngram_sets[i] = {}
80 | total = 0
81 | for ngram in ngrams(tokens, i):
82 | ngram_sets[i][ngram] = ngram_sets[i].get(ngram, 0) + 1
83 | total += 1
84 | ngram_sets[i][u"__TOTAL__"] = total
85 | return ngram_sets
86 |
87 | def compute_prf(self, sys_ngram_sets, model_ngram_sets, max_ngrams):
88 | scores = []
89 | for i in xrange(1, max_ngrams + 1):
90 | intersect = 0
91 | for ngram, model_ngram_count in model_ngram_sets[i].items():
92 | if ngram == "__TOTAL__":
93 | continue
94 | sys_ngram_count = sys_ngram_sets[i].get(ngram, 0)
95 | intersect += min(model_ngram_count, sys_ngram_count)
96 | recall = float(intersect) / model_ngram_sets[i][u"__TOTAL__"]
97 | prec = float(intersect) / sys_ngram_sets[i][u"__TOTAL__"]
98 |
99 | if intersect == 0:
100 | print "Warning: 0 {}-gram overlap".format(i)
101 | f1 = 0
102 | else:
103 | f1 = 2 * prec * recall / (prec + recall)
104 | scores.append(recall)
105 | scores.append(prec)
106 | scores.append(f1)
107 |
108 | return scores
109 |
--------------------------------------------------------------------------------
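
ROUGE.evaluate takes a list of (system name, summary text) pairs plus a list of model (reference) summaries and returns a pandas DataFrame of recall, precision, and F1 per n-gram order, averaged over models, mirroring its use in duc_testbed.py. A minimal sketch; the reference text is a placeholder and should be replaced with real human summaries:

    import sumpy
    import sumpy.io
    from sumpy.eval import ROUGE

    docs = sumpy.io.load_demo_docs()
    models = [u"A human-written reference summary would go here ..."]
    systems = [(u"lede", unicode(sumpy.lede(docs))),
               (u"lexrank", unicode(sumpy.lexrank(docs)))]
    rouge = ROUGE(max_ngrams=2, limit=100, limit_type=u"word")
    print rouge.evaluate(systems, models)
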
/sumpy/preprocessor.py:
--------------------------------------------------------------------------------
1 | import nltk.data
2 | from nltk.tokenize import WordPunctTokenizer
3 | from sklearn.feature_extraction.text import TfidfVectorizer
4 | import re
5 | import gzip
6 | import pkg_resources
7 | import os
8 |
9 | class SentenceTokenizerMixin(object):
10 | def build_sent_tokenizer(self):
11 | """Return a function that splits a string into a sequence of
12 | sentences."""
13 | if self._sentence_tokenizer is not None:
14 | tok = self._sentence_tokenizer
15 | else:
16 | tok = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
17 | return tok
18 |
19 |
20 | class WordTokenizerMixin(object):
21 | def build_word_tokenizer(self):
22 | """Return a function that splits a string into a sequence of words."""
23 | if self._word_tokenizer is not None:
24 | tokenize = self._word_tokenizer
25 | else:
26 | tokenize = WordPunctTokenizer().tokenize
27 | return tokenize
28 |
29 |
30 | class ROUGEWordTokenizerMixin(object):
31 | def build_word_tokenizer(self):
32 | """This mixin provides the same reg-ex based word tokenizer that is
33 | used in the official ROUGE perl script (Lin, 2004). See the readText
34 | subroutine (line 1816) of ROUGE-1.5.5.pl for reference."""
35 | if self._word_tokenizer is not None:
36 | tokenize = self._word_tokenizer
37 | else:
38 | def rouge_tokenize(sentence):
39 | s = re.sub(r"-", r" -", sentence, flags=re.UNICODE)
40 | s = re.sub(r"[^A-Za-z0-9\-]", r" ", s, flags=re.UNICODE)
41 | s = s.strip()
42 | s = re.sub(r"\s+", r" ", s, flags=re.UNICODE)
43 | return s.split(u" ")
44 | tokenize = rouge_tokenize
45 | return tokenize
46 |
47 | class CorpusTfidfMixin(object):
48 | def build_tfidf_vectorizer(self):
49 | self._tfidf_vectorizer = TfidfVectorizer(analyzer=lambda x: x)
50 | return self._tfidf_vectorizer.fit_transform
51 |
52 | class TextAnalyzerMixin(object):
53 |
54 | def build_analyzer(self):
55 | sent_tokenize = self._build_sent_tokenizer()
56 | word_tokenize = self._build_word_tokenizer()
57 | stem = self._build_stemmer()
58 | def analyzer(text):
59 | sents = sent_tokenize(text)
60 | tokenized_sents = [[stem(word) for word in word_tokenize(sent)]
61 | for sent in sents]
62 | return tokenized_sents, sents
63 | return analyzer
64 |
65 | def _build_sent_tokenizer(self):
66 | """Return a function that splits a string into a sequence of
67 | sentences."""
68 | if self._sentence_tokenizer is not None:
69 | return self._sentence_tokenizer
70 | else:
71 | return nltk.data.load('tokenizers/punkt/english.pickle').tokenize
72 |
73 | def _build_word_tokenizer(self):
74 | """Return a function that splits a string into a sequence of words."""
75 | if self._word_tokenizer is not None:
76 | tokenize = self._word_tokenizer
77 | else:
78 | tokenize = WordPunctTokenizer().tokenize
79 |
80 | return tokenize
81 |
82 | def _build_stemmer(self):
83 | if self._stemmer is not None:
84 | return self._stemmer
85 | else: return lambda w: w
86 |
87 | class SMARTStopWordsMixin(object):
88 | def build_stopwords(self):
89 | if self.remove_stopwords is True:
90 | if self._stopwords is None:
91 | path = pkg_resources.resource_filename(
92 | "sumpy",
93 | os.path.join("data", "smart_common_words.txt.gz"))
94 | with gzip.open(path, u"r") as f:
95 | self._stopwords = set(
96 | [word.strip().decode(u"utf-8").lower()
97 | for word in f.readlines()])
98 | return lambda word: word in self._stopwords
99 | else:
100 | return lambda word: False
101 |
102 | class LengthLimiterMixin(object):
103 | def build_length_limiter(self):
104 | """
105 | Return a function that shortens a list of tokens to a
106 | desired length.
107 | """
108 | if self._limit is None and self._limit_type is not None:
109 | raise Exception("Both limit and limit_type must be set.")
110 | if self._limit is not None and self._limit_type is None:
111 | raise Exception("Both limit and limit_type must be set.")
112 | if self._limit_type not in [None, u"word"]:
113 | raise Exception(
114 | "limit_type: {} not implemented.".format(self._limit_type))
115 |
116 | if self._limit_type is None:
117 | # Do not shorten, just return tokens unchanged.
118 | return lambda x: x
119 | if self._limit_type == u"word":
120 | # Shorten list to be `_limit` tokens long.
121 | def word_limiter(sequence):
122 | if len(sequence) < self._limit:
123 | print "Warning: document is shorter than the max length" \
124 | + " limit. This can effect evaluation negatively."
125 | return sequence[:self._limit]
126 | return word_limiter
127 |
--------------------------------------------------------------------------------
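
These mixins follow a common pattern: a host class that defines self._sentence_tokenizer and self._word_tokenizer (possibly as None) gains build_* methods that return callables, falling back to NLTK defaults. They are distinct from the annotator mixins of the same names in sumpy/annotators/_preprocessor.py. An illustrative sketch with a hypothetical host class, assuming the NLTK punkt model is already installed:

    from sumpy.preprocessor import SentenceTokenizerMixin, WordTokenizerMixin

    class TokenizerHost(SentenceTokenizerMixin, WordTokenizerMixin):
        """Hypothetical host class for illustration."""
        def __init__(self, sentence_tokenizer=None, word_tokenizer=None):
            self._sentence_tokenizer = sentence_tokenizer
            self._word_tokenizer = word_tokenizer

    host = TokenizerHost()
    sent_tok = host.build_sent_tokenizer()   # punkt sentence splitter by default
    word_tok = host.build_word_tokenizer()   # WordPunctTokenizer by default
    sents = sent_tok(u"SUMPY ranks sentences. Then it renders a summary.")
    print [word_tok(s) for s in sents]
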
/sumpy/annotators/_preprocessor.py:
--------------------------------------------------------------------------------
1 | from sumpy.annotators._annotator_base import _AnnotatorBase
2 | import pkg_resources
3 | import gzip
4 | import os
5 | import pandas as pd
6 | import nltk
7 | from nltk.tokenize import WordPunctTokenizer
8 | from sklearn.feature_extraction.text import TfidfTransformer
9 | from sklearn.feature_extraction.text import CountVectorizer
10 | from sklearn.metrics.pairwise import cosine_similarity
11 | import re
12 |
13 |
14 | class SentenceTokenizerMixin(_AnnotatorBase):
15 | """
16 | Analyze method takes a string (an article text usually) and splits it
17 | into substrings corresponding to the sentences in the original article.
18 | """
19 |
20 | def requires(self):
21 | return ["doc text"]
22 |
23 | def ndarray_requires(self):
24 | return []
25 |
26 | def returns(self):
27 | return ["sent id", "sent text"]
28 |
29 | def ndarray_returns(self):
30 | return []
31 |
32 | def name(self):
33 | return "SentenceTokenizerMixin"
34 |
35 | def build(self):
36 |
37 | if not hasattr(self, "_sentence_tokenizer"):
38 | self._sentence_tokenizer = None
39 |
40 | if self._sentence_tokenizer is None:
41 | dl = nltk.downloader.Downloader()
42 | if dl.is_installed("punkt") is False:
43 | print "Installing NLTK Punkt Sentence Tokenizer"
44 | dl.download("punkt")
45 |
46 | self._sentence_tokenizer = nltk.data.load(
47 | 'tokenizers/punkt/english.pickle').tokenize
48 |
49 | def process(self, input_df, ndarray_data):
50 | def split_text(group):
51 | row = group.irow(0)
52 | sents = self._sentence_tokenizer(row["doc text"])
53 | return pd.DataFrame([{"doc id": row["doc id"],
54 | "sent id": i, "sent text": sent}
55 | for i, sent in enumerate(sents, 1)])
56 |
57 | processed_df = input_df.groupby(
58 | "doc id", group_keys=False).apply(split_text)
59 |
60 | cols = input_df.columns.difference(processed_df.columns).tolist()
61 | cols += ["doc id"]
62 | output_df = input_df[cols].merge(
63 | processed_df, on="doc id", how="inner")
64 | return output_df, ndarray_data
65 |
66 | class WordTokenizerMixin(SentenceTokenizerMixin):
67 | """Analyze method takes a string (corresponding to a sentence) and splits
68 | it into substrings corresponding to the words in the original article."""
69 |
70 | def build(self):
71 |
72 | if not hasattr(self, "_word_tokenizer"):
73 | self._word_tokenizer = None
74 |
75 | if self._word_tokenizer is None:
76 | self._word_tokenizer = WordPunctTokenizer().tokenize
77 |
78 | def process(self, input_df, ndarray_data):
79 | input_df["words"] = input_df["sent text"].apply(
80 | self._word_tokenizer)
81 | return input_df, ndarray_data
82 |
83 | def requires(self):
84 | return ["sent id", "sent text"]
85 |
86 | def ndarray_requires(self):
87 | return []
88 |
89 | def returns(self):
90 | return ["words"]
91 |
92 | def ndarray_returns(self):
93 | return []
94 |
95 | def name(self):
96 | return "WordTokenizerMixin"
97 |
98 | class RawBOWMixin(WordTokenizerMixin):
99 |
100 | def build(self):
101 |
102 | if not hasattr(self, "_count_vectorizer"):
103 | self._count_vectorizer = None
104 |
105 | if self._count_vectorizer is None:
106 | self._count_vectorizer = CountVectorizer(
107 | input=u"content", preprocessor=lambda x: x,
108 | tokenizer=lambda x: x)
109 |
110 | def process(self, input_df, ndarray_data):
111 | ndarray_data["RawBOWMatrix"] = self._count_vectorizer.fit_transform(
112 | input_df["words"].tolist())
113 | return input_df, ndarray_data
114 |
115 | def requires(self):
116 | return ["words"]
117 |
118 | def returns(self):
119 | return []
120 |
121 | def ndarray_requires(self):
122 | return []
123 |
124 | def ndarray_returns(self):
125 | return ["RawBOWMatrix"]
126 |
127 | def name(self):
128 | return "RawBOWMixin"
129 |
130 | class BinaryBOWMixin(RawBOWMixin):
131 |
132 | def build(self):
133 | pass
134 |
135 | def process(self, input_df, ndarray_data):
136 | X = ndarray_data["RawBOWMatrix"].copy()
137 | X[X > 0] = 1
138 | ndarray_data["BinaryBOWMatrix"] = X
139 | return input_df, ndarray_data
140 |
141 | def requires(self):
142 | return []
143 |
144 | def returns(self):
145 | return []
146 |
147 | def ndarray_requires(self):
148 | return ["RawBOWMatrix",]
149 |
150 | def ndarray_returns(self):
151 | return ["BinaryBOWMatrix"]
152 |
153 | def name(self):
154 | return "BinaryBOWMixin"
155 |
156 | class TfIdfMixin(RawBOWMixin):
157 | def build(self):
158 | if not hasattr(self, "_tfidf_transformer"):
159 | self._tfidf_transformer = None
160 |
161 | if self._tfidf_transformer is None:
162 | self._tfidf_transformer = TfidfTransformer()
163 | #input=u"content", preprocessor=lambda x: x,
164 | #tokenizer=lambda x: x)
165 |
166 | def process(self, input_df, ndarray_data):
167 | X = self._tfidf_transformer.fit_transform(
168 | ndarray_data["RawBOWMatrix"])
169 | ndarray_data["TfIdfMatrix"] = X
170 | return input_df, ndarray_data
171 |
172 | def requires(self):
173 | return []
174 |
175 | def returns(self):
176 | return []
177 |
178 | def ndarray_requires(self):
179 | return ["RawBOWMatrix",]
180 |
181 | def ndarray_returns(self):
182 | return ["TfIdfMatrix"]
183 |
184 | def name(self):
185 | return "TfIdfMixin"
186 |
187 | class TfIdfCosineSimilarityMixin(TfIdfMixin):
188 |
189 | def build(self):
190 | pass
191 |
192 | def process(self, input_df, ndarray_data):
193 | K = cosine_similarity(ndarray_data["TfIdfMatrix"])
194 | ndarray_data["TfIdfCosSimMatrix"] = K
195 | return input_df, ndarray_data
196 |
197 | def requires(self):
198 | return []
199 |
200 | def returns(self):
201 | return []
202 |
203 | def ndarray_requires(self):
204 | return ["TfIdfMatrix"]
205 |
206 | def ndarray_returns(self):
207 | return ["TfIdfCosSimMatrix"]
208 |
209 | def name(self):
210 | return "TfIdfCosineSimilarityMixin"
211 |
212 |
213 |
--------------------------------------------------------------------------------
/sumpy/data/mead_example_docs/81.docsent:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | One American among 143 dead in crash
6 |
7 | MINA SALMAN PORT, Bahrain (AP) -- A man's black shoe, a plastic sandal and bits of yellow foam padding bobbed Thursday in the waters off this tiny island nation, where families were burying loved ones a day after Gulf Air Flight 072 crashed, killing all 143 aboard.
8 | Bahraini authorities and U.S. Navy divers based in the Gulf recovered both ''black boxes'' - the flight data and voice cockpit recorders - near where the plane slammed into shallow water off Bahrain's shore.
9 | Neither box appeared damaged, according to Bahrain civil defense chief James Windsor, who received the voice cockpit recorder Thursday from U.S. Navy divers who brought it to shore.
10 | Authorities were awaiting the arrival of experts from the U.S. National Transportation Safety Board for help with the Bahraini-led investigation.
11 | Six French government experts and an Airbus Industries representative flew in Thursday evening.
12 | Ali Ahmedi, a spokesman and an acting vice president for Gulf Air, said it was too early to speculate on what caused the plane to crash as it circled the airport before coming in to land.
13 | But he said there was no indication the pilot was anticipating an emergency landing.
14 | ''The pilot did not make any kind of statements of problems in the plane,'' Ahmedi said.
15 | Transportation Minister Sheik Ali bin Khalifa Al Khalifa said he was hopeful the black boxes would provide some clues.
16 | ''Any news, anything out of it would be a help,'' he said.
17 | Under the best of circumstances, a water landing is risky, said Michael Barr, director of the aviation program at the University of Southern California.
18 | Even a pilot coming in relatively slowly onto the water, hoping to skip across its surface like a stone tossed by a child, could clip a wing and lose control, he said.
19 | And the depth of the water would make little difference to the landing, experts said: A large airplane that crashes at high speed is going to be destroyed, whatever it hits.
20 | Evidence of that destruction lay off Bahrain on Thursday.
21 | In waters often less than 10 feet deep, shadowy bits of wing and fuselage, mostly in small pieces, were resting on the sandy sea floor.
22 | A few recognizable pieces of the Gulf Air Airbus 320 protruded from the water: a ripped tail wing with the airline's black, red and gold logo, skin of the fuselage with the letters 'LF AIR' above the surface.
23 | Most traces of the 143 victims were collected in the hours after the Cairo-to-Bahrain flight crashed Wednesday evening.
24 | Luggage and clothing that floated to the surface were removed so they wouldn't be swept away with the tides.
25 | Like the plane, many of the bodies were shattered, and relatives struggled to identify loved ones so they could claim their remains for burial.
26 | At a hotel in the capital, relatives sobbed as a Gulf Air official, his voice choking, read out names of their loved ones listed as victims.
27 | Family members were asked to make identifications from photos taken after the bodies were recovered.
28 | ''This is the worst day of my life.
29 | I lost a part of me,'' said Khalifa al-Hashil, 45, of Saudi Arabia.
30 | His 35-year-old brother, Mohammed, died in the crash.
31 | Fifteen victims were buried Thursday at Manama Cemetery, the country's largest.
32 | Mohammed Jassim, 45, an undertaker at the cemetery, washed disfigured faces and mutilated bodies with rose water before the remains - still in body bags tagged at a makeshift morgue - were placed in freshly dug graves.
33 | ''It's a painful sight,'' he said.
34 | ''I've handled dead bodies before, but none so dreadful to look at.''
35 | In 15-minute intervals, white Health Ministry vans pulled up at the cemetery to unload victims in tagged body bags.
36 | Chants of ''God is Great'' and mournful wails wafted over the cemetery during the burial.
37 | Relatives offered prayers for the dead, standing side by side, while others wept on each other's shoulders as clerics tried to comfort them.
38 | Thirty-six of the 143 victims were children, officials said.
39 | All appeared to have been traveling with their families.
40 | Many families in the region are ending vacations at this time of year, which could account for the large number of children aboard.
41 | Amjad Obaid, a physician, was burying his sister-in-law, 4-year-old niece and 10-year-old nephew.
42 | He said a disaster alert on his pager had summoned him to work.
43 | ''Only when I got to the hospital I found out that this plane carried my brother's wife and her children,'' Obaid said.
44 | They had been returning from a vacation in Egypt.
45 | After the crash, U.S. Navy helicopters, small boats and an oceangoing tug quickly joined the nighttime search and rescue effort a few miles off the northern coast of Bahrain.
46 | The island is the headquarters of the U.S. Navy's 5th Fleet.
47 | Bahraini Crown Prince Sheik Salman bin Hamad Al Khalifa personally directed the effort, the U.S. military said.
48 | Gulf Air said 135 passengers and eight crew members were on board.
49 | They included 64 Egyptians, 36 Bahrainis, 12 Saudi Arabians, nine Palestinians, six from the United Arab Emirates, three Chinese, two British and one each from the United States, Canada, Oman, Kuwait, Sudan, Australia, Oman, the Philippines, Poland, India and Morocco.
50 | The American killed in the crash was 31-year-old Seth J. Foti, a diplomatic courier carrying classified information in yellow pouches, the State Department said.
51 | Foti had joined the service 14 months ago, spokesman Richard Boucher said.
52 | He said he did not know what Foti had with him when the plane went down.
53 | ''His dedication to the mission of the courier service was unmatched, and he was clearly an asset to the Department of State and the U.S. government,'' Boucher said.
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/sumpy/system/_base.py:
--------------------------------------------------------------------------------
1 | from sumpy.annotators._annotator_base import _AnnotatorBase
2 | from sumpy.annotators import SentenceTokenizerMixin, WordTokenizerMixin
3 | from sumpy.document import Summary
4 | from abc import ABCMeta, abstractmethod
5 | import pandas as pd
6 | import numpy as np
7 | import networkx as nx
8 |
9 | class _SystemBase(object):
10 | """Abstract base class for summarizer systems."""
11 |
12 | __metaclass__ = ABCMeta
13 |
14 | def __init__(self, verbose=False):
15 | self.verbose = verbose
16 | self._dependency_graph = None
17 | self._annotators = None
18 | self._pipeline = None
19 |
20 | @abstractmethod
21 | def build_summary(self, input_df, ndarray_data):
22 | pass
23 |
24 | def summarize(self, inputs):
25 |
26 | if not hasattr(self, "_pipeline") or self._pipeline is None:
27 | self.build_pipeline()
28 |
29 | input_df, ndarray_data = self.prepare_inputs(inputs)
30 | processed_df, processed_ndarray_data = self.process_input(
31 | input_df, ndarray_data)
32 |
33 | return self.build_summary(processed_df, processed_ndarray_data)
34 |
35 | def build_pipeline(self):
36 | self.build_dependency_graph()
37 | self._pipeline = []
38 | for node in nx.topological_sort(self._dependency_graph):
39 | if node in self._annotators:
40 | self._pipeline.append(self._annotators[node])
41 | if self.verbose:
42 | print "{} ({}) build".format(self.__class__.__name__,
43 | self._annotators[node].name(self))
44 | self._annotators[node].build(self)
45 |
46 | def prepare_inputs(self, inputs, ndarray_data=None):
47 |
48 | requires = set()
49 | returns = set()
50 | ndarray_requires = set()
51 | ndarray_returns = set()
52 |
53 | for ann in self._pipeline:
54 | requires.update(ann.requires(self))
55 | returns.update(ann.returns(self))
56 | ndarray_requires.update(ann.ndarray_requires(self))
57 | ndarray_returns.update(ann.ndarray_returns(self))
58 |
59 | # Allocate keys for ndarray dependencies.
60 | if ndarray_data is None:
61 | ndarray_data = {}
62 | for key in ndarray_requires.union(ndarray_returns):
63 | if key not in ndarray_data:
64 | ndarray_data[key] = None
65 |
66 | # Allocate columns for dataframe data dependencies.
67 | all_cols = list(requires.union(returns))
68 | if isinstance(inputs, list) or isinstance(inputs, tuple):
69 | df = pd.DataFrame([{"doc id": doc_id, "doc text": doc_text}
70 | for doc_id, doc_text in enumerate(inputs)],
71 | columns=["doc id"] + all_cols)
72 | return df, ndarray_data
73 |
74 | elif isinstance(inputs, pd.DataFrame):
75 | if "doc id" not in inputs:
76 | raise Exception("input DataFrame must have column 'doc id'")
77 |
78 | cols = list(set(inputs.columns.tolist() + all_cols))
79 | df = pd.DataFrame(inputs.to_dict(), columns=cols)
80 | df.reset_index(inplace=True)
81 | return df, ndarray_data
82 | else:
83 | raise Exception("Bad input: list of strings or dataframe only.")
84 |
85 | def process_input(self, input_df, ndarray_data):
86 | cols = set(input_df.columns.tolist())
87 | for ann in self._pipeline:
88 |
89 | for rtype in ann.returns(self):
90 | assert rtype in cols
91 |
92 | for req in ann.requires(self):
93 | assert req in cols
94 |
95 | run_stage = input_df[ann.returns(self)].isnull().any().any() \
96 | or np.any([ndarray_data[rtype] is None
97 | for rtype in ann.ndarray_returns(self)])
98 |
99 | if run_stage:
100 |
101 | if self.verbose:
102 | print "{} ({}) process".format(
103 | self.__class__.__name__, ann.name(self))
104 |
105 | input_df, ndarray_data = ann.process(
106 | self, input_df, ndarray_data)
107 |
108 | return input_df, ndarray_data
109 |
110 | def build_dependency_graph(self):
111 | G = nx.DiGraph()
112 | self._annotators = {}
113 |
114 | def check_mixins(clazz, visited=set()):
115 | if not issubclass(clazz, _SystemBase):
116 | if issubclass(clazz, _AnnotatorBase):
117 | name = clazz.name(self)
118 | self._annotators[name] = clazz
119 | for req in clazz.requires(self):
120 | G.add_edge(req, name)
121 | for req in clazz.ndarray_requires(self):
122 | G.add_edge(req, name)
123 |
124 | for rtype in clazz.returns(self):
125 | G.add_edge(name, rtype)
126 | for rtype in clazz.ndarray_returns(self):
127 | G.add_edge(name, rtype)
128 |
129 | visited.add(clazz)
130 | for base in clazz.__bases__:
131 | if base in visited:
132 | continue
133 | if not issubclass(base, _AnnotatorBase):
134 | continue
135 | if base == _AnnotatorBase:
136 | continue
137 | check_mixins(base, visited)
138 |
139 | check_mixins(self.__class__)
140 | self._dependency_graph = G
141 |
142 | def print_dependency_graph(self, filename=None, to_iPython=True):
143 | import pygraphviz as pgv
144 | if not hasattr(self, "_dependency_graph") or \
145 | self._dependency_graph is None:
146 | self.build_dependency_graph()
147 |
148 | if filename is None:
149 | filename = "sumpy.tmp.png"
150 |
151 | G = pgv.AGraph(strict=False, directed=True)
152 | for node in self._dependency_graph:
153 | if node in self._annotators:
154 | G.add_node(node)
155 | G.get_node(node).attr["shape"] ="rectangle"
156 | elif node.startswith("f:"):
157 | G.add_node(node)
158 | G.get_node(node).attr["shape"] ="parallelogram"
159 | for edge in self._dependency_graph.in_edges(node):
160 | G.add_edge(edge[0], edge[1], color="green")
161 | else:
162 | for in_edge in self._dependency_graph.in_edges(node):
163 | for out_edge in self._dependency_graph.out_edges(node):
164 | G.add_edge(in_edge[0], out_edge[1],
165 | label=node, key=node)
166 |
167 | G.layout("dot")
168 | G.draw(filename)
169 | if to_iPython is True:
170 | from IPython.display import Image
171 | return Image(filename=filename)
172 |
173 | class AverageFeatureRankerBase(
174 | WordTokenizerMixin, _SystemBase):
175 |
176 | def build_summary(self, input_df, ndarray_data):
177 | cols = [f for f in input_df.columns.tolist() if f.startswith("f:")]
178 | X = input_df[cols].values
179 | input_df["rank"] = (X / X.max(axis=0)).mean(axis=1)
180 | output_df = input_df.sort(["rank"], ascending=False)
181 | return Summary(output_df)
182 |
--------------------------------------------------------------------------------
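
summarize() lazily assembles a pipeline by walking the class's mixin bases, wiring their requires/returns keys into a dependency graph, and running the annotators in topological order. The graph itself can be rendered with print_dependency_graph, which needs pygraphviz and Graphviz installed; a minimal sketch:

    from sumpy.system import LexRankSummarizer

    summarizer = LexRankSummarizer(verbose=True)
    summarizer.build_dependency_graph()
    summarizer.print_dependency_graph(filename="lexrank_pipeline.png",
                                      to_iPython=False)
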
/sumpy/annotators/_submodular.py:
--------------------------------------------------------------------------------
1 | from sumpy.annotators import WordTokenizerMixin, TfIdfCosineSimilarityMixin
2 | import numpy as np
3 |
4 |
5 | class SubmodularMMRMixin(TfIdfCosineSimilarityMixin):
6 |
7 | def build(self):
8 | if not hasattr(self, "lam"):
9 | self.lam = .3
10 | assert 0 <= self.lam
11 |
12 | if not hasattr(self, "scale"):
13 | self.scale = 1.0
14 | assert 0 <= self.scale
15 |
16 | if not hasattr(self, "budget_type"):
17 | self.budget_type = "word"
18 | assert self.budget_type in ["word", "byte"]
19 |
20 | if not hasattr(self, "budget_size"):
21 | self.budget_size = 400
22 | assert 0 < self.budget_size
23 |
24 | def rank(input_df, ndarray_data):
25 | if self.budget_type == "word":
26 | B = np.array(ndarray_data["RawBOWMatrix"].sum(axis=1))
27 | print type(B)
28 | elif self.budget_type == "byte":
29 | B = input_df["sent text"].apply(lambda x: len(x.replace("\n", ""))).values
30 | K = ndarray_data["TfIdfCosSimMatrix"]
31 | K = np.ma.masked_array(K, mask=np.diag(np.diag(K)))
32 | assert B.shape[0] == K.shape[0]
33 |
34 | #B = B[[0, 25, 54, 80]]
35 | print B
36 | #K = K[[0, 25, 54, 80]][:,[0, 25, 54, 80]]
37 | print K
38 | K_S = np.ma.masked_array(K, mask=False, hardmask=False)
39 | print K_S
40 | K_V = np.ma.masked_array(K, mask=False, hardmask=False)
41 | print K_V
42 |
43 | print
44 | print
45 |
46 | S = []
47 | B_S = 0
48 | V = range(K.shape[0])
49 | inspected_vertices = set()
50 | f_of_S = 0
51 | for rank in xrange(K.shape[0], 0, -1):
52 | #print "K_S"
53 | #print K_S
54 | #print "S"
55 | #print S
56 | #print "V"
57 | #print V
58 | max_gain = float("-inf")
59 | max_idx = None
60 | max_v = None
61 | max_f_of_S_plus_v = None
62 | for i, v in enumerate(V):
63 | if v in inspected_vertices:
64 | continue
65 | S_tmp = S + [v]
66 | V_tmp = V[:i] + V[i+1:]
67 | #print S_tmp
68 | #print V_tmp
69 | #print K[S_tmp][:, V_tmp]
70 | #print K[S_tmp][:, S_tmp].filled(0).sum()
71 | f_of_S_plus_v = K[S_tmp][:, V_tmp].sum() - \
72 | self.lam * K[S_tmp][:, S_tmp].filled(0).sum()
73 | gain = (f_of_S_plus_v - f_of_S) / (B[v] ** self.scale)
74 |
75 | if gain > max_gain:
76 | max_gain = gain
77 | max_idx = i
78 | max_v = v
79 | max_f_of_S_plus_v = f_of_S_plus_v
80 | 
81 |                 # Add the best candidate only if it improves the objective and
82 |                 # still fits in the budget; either way mark it as inspected so
83 |                 # it is not considered again.
84 |                 if max_gain > 0 and B_S + B[max_v] <= self.budget_size:
85 |                     S += [max_v]
86 |                     del V[max_idx]
87 |                     f_of_S = max_f_of_S_plus_v
88 |                     B_S += B[max_v]
89 |                     input_df.ix[max_v, "f:submodular-mmr"] = rank
90 | 
91 |                 inspected_vertices.add(max_v)
92 | 
93 |             return input_df, ndarray_data
109 | self._submodular_mmr = rank
110 |
111 | def process(self, input_df, ndarray_data):
112 | return self._submodular_mmr(input_df, ndarray_data)
113 |
114 | def requires(self):
115 | return ["sent text"]
116 |
117 | def ndarray_requires(self):
118 | return ["TfIdfCosSimMatrix", "RawBOWMatrix"]
119 |
120 | def returns(self):
121 | return ["f:submodular-mmr"]
122 |
123 | def ndarray_returns(self):
124 | return []
125 |
126 | def name(self):
127 | return "SubmodularMMRMixin"
128 |
129 | class MonotoneSubmodularMixin(WordTokenizerMixin):
130 | def build(self):
131 | if not hasattr(self, "k"):
132 | self.k = 5
133 | assert self.k > 0
134 |
135 | if not hasattr(self, "f_of_A") or self.f_of_A is None:
136 | def f_of_A(system, A, V_min_A, e, input_df, ndarray_input):
137 | return len(
138 | set([word for words in input_df.ix[A, "words"].tolist() for word in words]))
139 | self.f_of_A = f_of_A
140 |
141 |     def process(self, input_df, ndarray_data):
142 |         # Greedily add the candidate with the largest f_of_A marginal gain.
143 |         input_size = len(input_df)
144 |         S = []
145 |         V_min_S = [i for i in xrange(input_size)]
146 |         f_of_S = 0
147 | for i in xrange(self.k):
148 | arg_max = None
149 | gain_max = 0
150 | f_of_S_max = 0
151 | for pos, elem in enumerate(V_min_S):
152 | S_plus_e = S + [elem]
153 | V_min_S_plus_e = V_min_S[:pos] + V_min_S[pos+1:]
154 | score = self.f_of_A(
155 | self, S_plus_e, V_min_S_plus_e, elem, input_df, ndarray_data)
156 | gain = score - f_of_S
157 |
158 | if gain > gain_max:
159 | arg_max = pos
160 | gain_max = gain
161 | f_of_S_max = score
162 |
163 | if arg_max is not None:
164 | S += [V_min_S[arg_max]]
165 | f_of_S = f_of_S_max
166 | del V_min_S[arg_max]
167 |
168 | input_df.ix[S, "f:monotone-submod"] = 1
169 | input_df.ix[V_min_S, "f:monotone-submod"] = 0
170 |
171 | return input_df, ndarray_data
172 |
173 |     def process2(self, input_df, ndarray_data):
174 |         """Greedy selection variant that maximizes coverage of the "nuggets"
175 |         sets attached to each sentence of input_df."""
176 |         input_size = len(input_df)
177 |         S = []
178 |         N = set()
179 | 
180 |         # n_of_e[e] is the set of nuggets covered by sentence e; the objective
181 |         # f(S) is the number of distinct nuggets covered by the selection S.
182 |         n_of_e = input_df["nuggets"].tolist()
183 |         V_min_S = [i for i in xrange(input_size)]
184 |         f_of_S = 0
185 | 
186 |         for i in xrange(self.k):
187 |             arg_max = None
188 |             gain_max = 0
189 |             for pos, elem in enumerate(V_min_S):
190 |                 n_of_S_U_e = N.union(n_of_e[elem])
191 |                 gain = len(n_of_S_U_e) - f_of_S
192 |                 if gain > gain_max:
193 |                     arg_max = pos
194 |                     gain_max = gain
195 | 
196 |             if arg_max is not None:
197 |                 S = S + [V_min_S[arg_max]]
198 |                 N = N.union(n_of_e[V_min_S[arg_max]])
199 |                 f_of_S = len(N)
200 |                 del V_min_S[arg_max]
201 | 
202 |         input_df.ix[S, "f:monotone-submod"] = 1
203 |         input_df.ix[V_min_S, "f:monotone-submod"] = 0
204 | 
205 |         return input_df, ndarray_data
224 |
225 |
226 | def requires(self):
227 | return ["words"]
228 |
229 | def ndarray_requires(self):
230 | return []
231 |
232 | def returns(self):
233 |         return ["f:monotone-submod"]
234 |
235 | def ndarray_returns(self):
236 | return []
237 |
238 | def name(self):
239 | return "MonotoneSubmod"
240 |
241 |
242 |
243 |
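244 | # Usage sketch (illustrative only). SubmodularMMRMixin reads four optional
245 | # attributes in build(): lam (redundancy penalty), scale (cost exponent),
246 | # budget_type ("word" or "byte"), and budget_size. MonotoneSubmodularMixin
247 | # reads k, the number of sentences to select, and f_of_A, a monotone set
248 | # function over the current selection A with the signature used above; the
249 | # default counts the distinct words covered by A. A custom nugget-coverage
250 | # objective could be attached to a summarizer instance (here "system" stands
251 | # for any class mixing this annotator into the pipeline base) before build()
252 | # runs:
253 | #
254 | #     def nugget_coverage(system, A, V_min_A, e, input_df, ndarray_data):
255 | #         nuggets = input_df.ix[A, "nuggets"].tolist()
256 | #         return len(set(n for ns in nuggets for n in ns))
257 | #
258 | #     system.k = 10
259 | #     system.f_of_A = nugget_coverage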
--------------------------------------------------------------------------------
/sumpy/__init__.py:
--------------------------------------------------------------------------------
1 | import sumpy.io
2 | import sumpy.system
3 | from sumpy.simple import lede, centroid, mmr, textrank, lexrank
4 |
5 |
6 |
7 | #import numpy as np
8 | #from itertools import izip
9 | #import nltk.data
10 | #from nltk.tokenize import WordPunctTokenizer
11 | #from nltk.stem.snowball import EnglishStemmer
12 | #from nltk.corpus import stopwords
13 | #import heapq
14 | #from collections import defaultdict
15 | #
16 | #class DocumentSetReader(object):
17 | # def __init__(self, input=u"filename", preprocessor=None, sentence_processor=None,
18 | # token_processor=None, token_processor_returns=None, stop_filter=None):
19 | #
20 | # if input not in set([u"filename", u"file", u"content"]):
21 | # raise ValueError(
22 | # u"input argument must be 'filename', 'file', or 'content'")
23 | # self.input = input
24 | #
25 | # self.preprocessor = preprocessor
26 | #
27 | # if sentence_processor is None:
28 | # senttok = nltk.data.load('tokenizers/punkt/english.pickle')
29 | # sentence_processor = lambda x: senttok.tokenize(x)
30 | # self.sentence_processor = sentence_processor
31 | #
32 | # if token_processor is None:
33 | # wordtok = WordPunctTokenizer()
34 | # stemmer = EnglishStemmer()
35 | # def default_token_processor(sentence):
36 | # tokens = [[stemmer.stem(word.lower())]
37 | # for word in wordtok.tokenize(sentence)]
38 | # return tokens
39 | # token_processor = default_token_processor
40 | # token_processor_returns = ["token"]
41 | #
42 | # self.token_processor = token_processor
43 | # self.token_processor_returns = token_processor_returns
44 | #
45 | # if stop_filter is None:
46 | # stop = stopwords.words('english')
47 | # stop_filter = lambda token: token in stop or len(token) <= 2
48 | # self.stop_filter = stop_filter
49 | #
50 | # def load_documents(self, documents, names=None):
51 | # max_docs = len(documents)
52 | # if names is None:
53 | # names = ["doc{}".format(n) for n in xrange(max_docs)]
54 | # assert len(names) == len(documents)
55 | #
56 | # sentences = {}
57 | #
58 | # token_type_index = self.token_processor_returns.index(u'token')
59 | # next_sentence_id = 0
60 | #
61 | # for n_doc, (name, document) in enumerate(izip(names, documents)):
62 | # print n_doc
63 | # text = self._read(document)
64 | # for n_sent, sentence in enumerate(self.sentence_processor(text)):
65 | # tokens = {tok_type: list()
66 | # for tok_type in self.token_processor_returns}
67 | #
68 | # for token_types in self.token_processor(sentence):
69 | # if self.stop_filter(token_types[token_type_index]):
70 | # continue
71 | # for tok_type, token in izip(
72 | # self.token_processor_returns, token_types):
73 | # tokens[tok_type].append(token)
74 | # if len(tokens[u'token']) == 0:
75 | # continue
76 | #
77 | # sentences[next_sentence_id] = {u"name": name,
78 | # u"n_doc": n_doc,
79 | # u"n_sent": n_sent,
80 | # u"tokens": tokens,
81 | # u"sentence": sentence}
82 | # next_sentence_id += 1
83 | # return sentences
84 | #
85 | # def _read(self, document):
86 | #
87 | # if self.input == u"filename":
88 | # with open(document, u"r") as f:
89 | # text = ''.join(f.readlines())
90 | # elif self.input == u"file":
91 | # text = ''.join(document.readlines())
92 | # elif self.input == u"content":
93 | # text = document
94 | #
95 | # if isinstance(text, str):
96 | # text = text.decode(u"utf-8")
97 | #
98 | # if self.preprocessor is not None:
99 | # text = self.preprocessor(text)
100 | #
101 | # return text
102 | #
103 | #class SentenceRanker(object):
104 | # pass
105 | #
106 | #class SumBasicRanker(SentenceRanker):
107 | #
108 | # def rank(self, summary_input):
109 | # print "RANKING"
110 | # ordered = []
111 | # unigram_probs = self._build_unigram_probs(summary_input)
112 | #
113 | # heap = [(1-prob, word) for word, prob in unigram_probs.items()]
114 | # heapq.heapify(heap)
115 | #
116 | # weights = []
117 | # token2sentids = defaultdict(list)
118 | #
119 | # covered = set()
120 | # n_sents = len(summary_input)
121 | #
122 | # print "Debug"
123 | # for sent_id in sorted(summary_input.keys()):
124 | # weight = 0
125 | # length = 0
126 | # print sent_id
127 | # for token in summary_input[sent_id][u'tokens'][u'token']:
128 | # weight += unigram_probs[token]
129 | # token2sentids[token].append(sent_id)
130 | # length += 1
131 | # print u"{}/{}".format(token, weight),
132 | # print
133 | # weight /= float(length)
134 | # print weight
135 | # weights.append(weight)
136 | #
137 | # while len(ordered) != n_sents:
138 | # # Get highest prob word (1)
139 | # prob, word = heapq.heappop(heap)
140 | #
141 | # # Get highest scored sentence containing highest prob word
142 | # sent_ids = token2sentids[word]
143 | # sent_ids.sort(key=lambda x: weights[x])
144 | #
145 | # for sent_id in sent_ids:
146 | # print sent_id, weights[sent_id]
147 | # print summary_input[sent_id][u'sentence']
148 | # break
149 | #
150 | # sent_id = sent_ids.pop()
151 | # while sent_id in covered:
152 | # if len(sent_ids) == 0:
153 | # break
154 | # sent_id = sent_ids.pop()
155 | #
156 | # if len(sent_ids) == 0:
157 | # continue
158 | #
159 | # ordered.append(sent_id)
160 | # covered.add(sent_id)
161 | #
162 | # # for sent_id in sent_ids:
163 | # # weights[sent_id] = (1 - prob)
164 | # heapq.heappush(heap, (1 - (1 - prob)**2, word))
165 | # print word, weights
166 | # print summary_input[sent_id][u'sentence']
167 | # #for sent_id in sent_ids:
168 | # # print sent_id, weights[sent_id]
169 | # #for prob, word in heapq.heappop(heap)
170 | # #def
171 | #
172 | # def _build_unigram_probs(self, summary_input):
173 | # probs = {}
174 | # total = 0
175 | # for sentence in summary_input.values():
176 | # for token in sentence[u'tokens'][u'token']:
177 | # probs[token] = probs.get(token, 0) + 1
178 | # total += 1
179 | #
180 | # assert total > 1
181 | # total = float(total)
182 | # for key in probs.keys():
183 | # probs[key] /= total
184 | # return probs
185 | #
186 | #class PageRank(object):
187 | #
188 | # def __init__(self, max_iters=100, tol=1E-4, d=.85):
189 | # self.max_iters = max_iters
190 | # self.tol = tol
191 | # self.d = d
192 | #
193 | # def rank(self, K):
194 | # n_nodes = K.shape[0]
195 | # r = np.ones((n_nodes, 1), dtype=np.float64) / n_nodes
196 | # #r /= np.sum(r)
197 | # last_r = np.ones((n_nodes, 1))
198 | # K_hat = (self.d * K) + \
199 | # (float(1 - self.d) / n_nodes) * np.ones((n_nodes, n_nodes))
200 | #
201 | # converged = False
202 | # for n_iter in xrange(self.max_iters):
203 | # last_r = r
204 | # r = np.dot(K_hat, r)
205 | # r /= np.sum(r)
206 | #
207 | # if (np.abs(r - last_r) < self.tol).any():
208 | # converged = True
209 | # break
210 | #
211 | # if not converged:
212 | # print "Warning: PageRank not converged after %d iters" % self.max_iters
213 | #
214 | # return r
215 | #
216 | #
217 | #
218 | #class LexRank(object):
219 | # pass
220 | #
221 | #class TextRank(object):
222 | #
223 | # def summarize(self, text_units):
224 | # pass
225 | #
226 | # def sentence_tokenizer(self):
227 | # pass
228 | # def word_tokenizer(self):
229 | # pass
230 |
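231 | 
232 | # Usage sketch: the convenience wrappers imported at the top of this module
233 | # (lede, centroid, mmr, textrank, lexrank) are exposed directly on the sumpy
234 | # package. The file names below are placeholders; the functions are assumed to
235 | # accept the same document collection that the underlying summarize() calls do.
236 | #
237 | #     import sumpy
238 | #     docs = [open(path).read() for path in ["doc1.txt", "doc2.txt"]]
239 | #     print sumpy.lexrank(docs)
240 | #     print sumpy.textrank(docs)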
--------------------------------------------------------------------------------
/sumpy/annotators/_feature_extractors.py:
--------------------------------------------------------------------------------
1 | from sumpy.annotators import (SentenceTokenizerMixin, BinaryBOWMixin,
2 | TfIdfMixin, TfIdfCosineSimilarityMixin)
3 | import numpy as np
4 | from itertools import combinations
5 |
6 | class LedeMixin(SentenceTokenizerMixin):
7 |
8 | def build(self):
9 | pass
10 |
11 | def process(self, input_df, ndarray_data):
12 | input_df[u"f:lede"] = 0
13 | for doc_id, group in input_df.groupby("doc id"):
14 | idx = group["sent id"].argmin()
15 | input_df.ix[idx, u"f:lede"] = 1
16 | return input_df, ndarray_data
17 |
18 | def requires(self):
19 | return ["sent id",]
20 |
21 | def ndarray_requires(self):
22 | return []
23 |
24 | def returns(self):
25 | return ["f:lede"]
26 |
27 | def ndarray_returns(self):
28 | return []
29 |
30 | def name(self):
31 | return "LedeMixin"
32 |
33 | class TextRankMixin(BinaryBOWMixin):
34 |
35 | def build(self):
36 | if not hasattr(self, "directed"):
37 | self.directed = u"undirected"
38 | assert self.directed in ["undirected",] # [u"directed", "undirected"]
39 | # TODO actually implement directed
40 |
41 | if not hasattr(self, "d"):
42 | self.d = .85
43 | assert 0 < self.d and self.d < 1
44 |
45 | if not hasattr(self, "max_iters"):
46 | self.max_iters = 20
47 | assert isinstance(self.max_iters, int) and self.max_iters > 0
48 |
49 | if not hasattr(self, "tol"):
50 | self.tol = .0001
51 | assert 0 < self.tol
52 |
53 | def textrank(input_df, ndarray_data):
54 | max_sents = input_df.shape[0]
55 | l = input_df["words"].apply(len).tolist()
56 | K = self._textrank_kernel(
57 | l, ndarray_data["BinaryBOWMatrix"], directed=self.directed)
58 | M_hat = (self.d * K) + \
59 | (float(1 - self.d) / max_sents) * np.ones(
60 | (max_sents, max_sents))
61 | M_hat /= np.sum(M_hat, axis=0)
62 | r = np.ones((max_sents), dtype=np.float64) / max_sents
63 |
64 | converged = False
65 | for n_iter in xrange(self.max_iters):
66 | last_r = r
67 | r = np.dot(M_hat, r)
68 |
69 |                 if (np.abs(r - last_r) < self.tol).all():
70 |                     converged = True
71 |                     break
72 | 
73 |             if not converged:
74 |                 print "warning:",
75 |                 print "textrank failed to converge after {} iters".format(
76 |                     self.max_iters)
77 | input_df["f:textrank"] = r
78 | return input_df, ndarray_data
79 | self._textrank = textrank
80 |
81 | def process(self, input_df, ndarray_data):
82 | return self._textrank(input_df, ndarray_data)
83 |
84 |     def _textrank_kernel(self, l, X, directed=u"undirected"):
85 |         """Compute the TextRank sentence similarity matrix: the number of words
86 |         two sentences share, normalized by the log of the product of their
87 |         lengths."""
88 |         # N[i, j] counts the words shared by sentences i and j.
89 |         N = X.dot(X.T)
90 | 
91 |         n_sents = X.shape[0]
92 |         M = np.zeros((n_sents, n_sents), dtype=np.float64)
93 |         for i, j in combinations(xrange(n_sents), 2):
94 |             val = N[i,j]
95 |             val /= np.log(l[i] * l[j])
96 |             M[i,j] = val
97 |             M[j,i] = val
98 |         return M
101 |
102 | def requires(self):
103 | return ["words",]
104 |
105 | def ndarray_requires(self):
106 | return ["BinaryBOWMatrix",]
107 |
108 | def returns(self):
109 | return ["f:textrank"]
110 |
111 | def ndarray_returns(self):
112 | return []
113 |
114 | def name(self):
115 | return "TextRankMixin"
116 |
117 | class LexRankMixin(TfIdfCosineSimilarityMixin):
118 |
119 | def build(self):
120 | if not hasattr(self, "d"):
121 | self.d = .85
122 | assert 0 < self.d and self.d < 1
123 |
124 | if not hasattr(self, "max_iters"):
125 | self.max_iters = 20
126 | assert isinstance(self.max_iters, int) and self.max_iters > 0
127 |
128 | if not hasattr(self, "tol"):
129 | self.tol = .0001
130 | assert 0 < self.tol
131 |
132 | def lexrank(input_df, ndarray_data):
133 | max_sents = input_df.shape[0]
134 | #l = input_df["words"].apply(len).tolist()
135 | K = ndarray_data["TfIdfCosSimMatrix"]
136 | M_hat = (self.d * K) + \
137 | (float(1 - self.d) / max_sents) * np.ones(
138 | (max_sents, max_sents))
139 | M_hat /= np.sum(M_hat, axis=0)
140 | r = np.ones((max_sents), dtype=np.float64) / max_sents
141 |
142 | converged = False
143 | for n_iter in xrange(self.max_iters):
144 | last_r = r
145 | r = np.dot(M_hat, r)
146 |
147 |                 if (np.abs(r - last_r) < self.tol).all():
148 |                     converged = True
149 |                     break
150 | 
151 |             if not converged:
152 |                 print "warning:",
153 |                 print "lexrank failed to converge after {} iters".format(
154 |                     self.max_iters)
155 | input_df["f:lexrank"] = r
156 | return input_df, ndarray_data
157 | self._lexrank = lexrank
158 |
159 | def process(self, input_df, ndarray_data):
160 | return self._lexrank(input_df, ndarray_data)
161 |
162 | def requires(self):
163 | return []
164 |
165 | def ndarray_requires(self):
166 | return ["TfIdfCosSimMatrix",]
167 |
168 | def returns(self):
169 | return ["f:lexrank"]
170 |
171 | def ndarray_returns(self):
172 | return []
173 |
174 | def name(self):
175 | return "LexRankMixin"
176 |
177 | class CentroidMixin(TfIdfMixin, BinaryBOWMixin):
178 |
179 | def build(self):
180 | pass
181 |
182 | def process(self, input_df, ndarray_data):
183 | B = ndarray_data["BinaryBOWMatrix"]
184 | X = ndarray_data["TfIdfMatrix"]
185 | c = X.sum(axis=0)
186 | assert c.shape[1] == X.shape[1]
187 | input_df["f:centroid"] = B.dot(c.T)
188 | return input_df, ndarray_data
189 |
190 | def requires(self):
191 | return []
192 |
193 | def ndarray_requires(self):
194 | return ["TfIdfMatrix", "BinaryBOWMatrix"]
195 |
196 | def returns(self):
197 | return ["f:centroid"]
198 |
199 | def ndarray_returns(self):
200 | return []
201 |
202 | def name(self):
203 | return "CentroidMixin"
204 |
205 | class MMRMixin(TfIdfCosineSimilarityMixin):
206 |
207 | def build(self):
208 | if not hasattr(self, "lam"):
209 | self.lam = .7
210 | assert 0 < self.lam and self.lam < 1
211 |
212 | def rank(input_df, ndarray_data):
213 | K = ndarray_data["TfIdfCosSimMatrix"]
214 | K = np.ma.masked_array(K, mask=np.diag(np.diag(K)))
215 | K_input = np.ma.masked_array(
216 | K, mask=False, fill_value=0, hardmask=False)
217 | K_summ = np.ma.masked_array(
218 | K, mask=True, fill_value=0, hardmask=False)
219 |
220 | w1 = self.lam
221 | w2 = (1 - w1)
222 | for rank in range(K.shape[0], 0, -1):
223 | if rank == K.shape[0]:
224 | K_input_max = K_input.max(axis=1).filled(float("-inf"))
225 | idx = np.argmax(K_input_max)
226 | else:
227 |                     K_summ_max = K_summ.max(axis=1).filled(0)
228 |                     K_input_max = K_input.max(axis=1).filled(float("-inf"))
229 |                     # MMR: reward input similarity, penalize summary similarity.
230 |                     S = w1 * K_input_max - w2 * K_summ_max
231 |                     idx = np.argmax(S)
232 |
233 | K_summ.mask[:,idx] = False
234 | K_summ.mask[idx, idx] = True
235 | K_input.mask[idx,:] = True
236 |
237 | input_df.ix[idx, "f:mmr"] = rank
238 |
239 | return input_df, ndarray_data
240 | self._mmr = rank
241 |
242 | def process(self, input_df, ndarray_data):
243 | return self._mmr(input_df, ndarray_data)
244 |
245 | def requires(self):
246 | return []
247 |
248 | def ndarray_requires(self):
249 | return ["TfIdfCosSimMatrix"]
250 |
251 | def returns(self):
252 | return ["f:mmr"]
253 |
254 | def ndarray_returns(self):
255 | return []
256 |
257 | def name(self):
258 | return "MMRMixin"
259 |
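260 | # Sketch of a new feature annotator (illustrative only). Every mixin in this
261 | # module follows the same contract: build() validates hyperparameters,
262 | # process() fills one or more "f:" columns of input_df, and requires() /
263 | # returns() (plus their ndarray_* variants) declare the dependency-graph
264 | # edges. The mixin below is hypothetical; it scores each sentence by its word
265 | # count and would need WordTokenizerMixin imported from sumpy.annotators.
266 | #
267 | #     class SentenceLengthMixin(WordTokenizerMixin):
268 | #
269 | #         def build(self):
270 | #             pass
271 | #
272 | #         def process(self, input_df, ndarray_data):
273 | #             input_df["f:sent-length"] = input_df["words"].apply(len)
274 | #             return input_df, ndarray_data
275 | #
276 | #         def requires(self):
277 | #             return ["words"]
278 | #
279 | #         def ndarray_requires(self):
280 | #             return []
281 | #
282 | #         def returns(self):
283 | #             return ["f:sent-length"]
284 | #
285 | #         def ndarray_returns(self):
286 | #             return []
287 | #
288 | #         def name(self):
289 | #             return "SentenceLengthMixin"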
--------------------------------------------------------------------------------
/sumpy/data/duc07_task2.json:
--------------------------------------------------------------------------------
1 | {
2 | "D0703A": {
3 | "A": {
4 | "inputs": [
5 | ],
6 | "models": [
7 | "D0703-A.M.100.A.A",
8 | "D0703-A.M.100.A.C",
9 | "D0703-A.M.100.A.D",
10 | "D0703-A.M.100.A.J",
11 | ],
12 | },
13 | "B": {
14 | "inputs": [
15 | ],
16 | "models": [
17 | "D0703-B.M.100.A.A",
18 | "D0703-B.M.100.A.C",
19 | "D0703-B.M.100.A.D",
20 | "D0703-B.M.100.A.J",
21 | ],
22 | },
23 | "C": {
24 | "inputs": [
25 | ],
26 | "models": [
27 | "D0703-C.M.100.A.A",
28 | "D0703-C.M.100.A.C",
29 | "D0703-C.M.100.A.D",
30 | "D0703-C.M.100.A.J",
31 | ],
32 | },
33 | },
34 | "D0706B": {
35 | "A": {
36 | "inputs": [
37 | ],
38 | "models": [
39 | "D0706-A.M.100.B.B",
40 | "D0706-A.M.100.B.D",
41 | "D0706-A.M.100.B.E",
42 | "D0706-A.M.100.B.I",
43 | ],
44 | },
45 | "B": {
46 | "inputs": [
47 | ],
48 | "models": [
49 | "D0706-B.M.100.B.B",
50 | "D0706-B.M.100.B.D",
51 | "D0706-B.M.100.B.E",
52 | "D0706-B.M.100.B.I",
53 | ],
54 | },
55 | "C": {
56 | "inputs": [
57 | ],
58 | "models": [
59 | "D0706-C.M.100.B.B",
60 | "D0706-C.M.100.B.D",
61 | "D0706-C.M.100.B.E",
62 | "D0706-C.M.100.B.I",
63 | ],
64 | },
65 | },
66 | "D0711C": {
67 | "A": {
68 | "inputs": [
69 | ],
70 | "models": [
71 | "D0711-A.M.100.C.A",
72 | "D0711-A.M.100.C.B",
73 | "D0711-A.M.100.C.C",
74 | "D0711-A.M.100.C.F",
75 | ],
76 | },
77 | "B": {
78 | "inputs": [
79 | ],
80 | "models": [
81 | "D0711-B.M.100.C.A",
82 | "D0711-B.M.100.C.B",
83 | "D0711-B.M.100.C.C",
84 | "D0711-B.M.100.C.F",
85 | ],
86 | },
87 | "C": {
88 | "inputs": [
89 | ],
90 | "models": [
91 | "D0711-C.M.100.C.A",
92 | "D0711-C.M.100.C.B",
93 | "D0711-C.M.100.C.C",
94 | "D0711-C.M.100.C.F",
95 | ],
96 | },
97 | },
98 | "D0716D": {
99 | "A": {
100 | "inputs": [
101 | ],
102 | "models": [
103 | "D0716-A.M.100.D.C",
104 | "D0716-A.M.100.D.D",
105 | "D0716-A.M.100.D.E",
106 | "D0716-A.M.100.D.F",
107 | ],
108 | },
109 | "B": {
110 | "inputs": [
111 | ],
112 | "models": [
113 | "D0716-B.M.100.D.C",
114 | "D0716-B.M.100.D.D",
115 | "D0716-B.M.100.D.E",
116 | "D0716-B.M.100.D.F",
117 | ],
118 | },
119 | "C": {
120 | "inputs": [
121 | ],
122 | "models": [
123 | "D0716-C.M.100.D.C",
124 | "D0716-C.M.100.D.D",
125 | "D0716-C.M.100.D.E",
126 | "D0716-C.M.100.D.F",
127 | ],
128 | },
129 | },
130 | "D0721E": {
131 | "A": {
132 | "inputs": [
133 | ],
134 | "models": [
135 | "D0721-A.M.100.E.B",
136 | "D0721-A.M.100.E.C",
137 | "D0721-A.M.100.E.E",
138 | "D0721-A.M.100.E.G",
139 | ],
140 | },
141 | "B": {
142 | "inputs": [
143 | ],
144 | "models": [
145 | "D0721-B.M.100.E.B",
146 | "D0721-B.M.100.E.C",
147 | "D0721-B.M.100.E.E",
148 | "D0721-B.M.100.E.G",
149 | ],
150 | },
151 | "C": {
152 | "inputs": [
153 | ],
154 | "models": [
155 | "D0721-C.M.100.E.B",
156 | "D0721-C.M.100.E.C",
157 | "D0721-C.M.100.E.E",
158 | "D0721-C.M.100.E.G",
159 | ],
160 | },
161 | },
162 | "D0726F": {
163 | "A": {
164 | "inputs": [
165 | ],
166 | "models": [
167 | "D0726-A.M.100.F.A",
168 | "D0726-A.M.100.F.E",
169 | "D0726-A.M.100.F.F",
170 | "D0726-A.M.100.F.G",
171 | ],
172 | },
173 | "B": {
174 | "inputs": [
175 | ],
176 | "models": [
177 | "D0726-B.M.100.F.A",
178 | "D0726-B.M.100.F.E",
179 | "D0726-B.M.100.F.F",
180 | "D0726-B.M.100.F.G",
181 | ],
182 | },
183 | "C": {
184 | "inputs": [
185 | ],
186 | "models": [
187 | "D0726-C.M.100.F.A",
188 | "D0726-C.M.100.F.E",
189 | "D0726-C.M.100.F.F",
190 | "D0726-C.M.100.F.G",
191 | ],
192 | },
193 | },
194 | "D0727G": {
195 | "A": {
196 | "inputs": [
197 | ],
198 | "models": [
199 | "D0727-A.M.100.G.A",
200 | "D0727-A.M.100.G.F",
201 | "D0727-A.M.100.G.G",
202 | "D0727-A.M.100.G.H",
203 | ],
204 | },
205 | "B": {
206 | "inputs": [
207 | ],
208 | "models": [
209 | "D0727-B.M.100.G.A",
210 | "D0727-B.M.100.G.F",
211 | "D0727-B.M.100.G.G",
212 | "D0727-B.M.100.G.H",
213 | ],
214 | },
215 | "C": {
216 | "inputs": [
217 | ],
218 | "models": [
219 | "D0727-C.M.100.G.A",
220 | "D0727-C.M.100.G.F",
221 | "D0727-C.M.100.G.G",
222 | "D0727-C.M.100.G.H",
223 | ],
224 | },
225 | },
226 | "D0736H": {
227 | "A": {
228 | "inputs": [
229 | ],
230 | "models": [
231 | "D0736-A.M.100.H.G",
232 | "D0736-A.M.100.H.H",
233 | "D0736-A.M.100.H.I",
234 | "D0736-A.M.100.H.J",
235 | ],
236 | },
237 | "B": {
238 | "inputs": [
239 | ],
240 | "models": [
241 | "D0736-B.M.100.H.G",
242 | "D0736-B.M.100.H.H",
243 | "D0736-B.M.100.H.I",
244 | "D0736-B.M.100.H.J",
245 | ],
246 | },
247 | "C": {
248 | "inputs": [
249 | ],
250 | "models": [
251 | "D0736-C.M.100.H.G",
252 | "D0736-C.M.100.H.H",
253 | "D0736-C.M.100.H.I",
254 | "D0736-C.M.100.H.J",
255 | ],
256 | },
257 | },
258 | "D0740I": {
259 | "A": {
260 | "inputs": [
261 | ],
262 | "models": [
263 | "D0740-A.M.100.I.D",
264 | "D0740-A.M.100.I.H",
265 | "D0740-A.M.100.I.I",
266 | "D0740-A.M.100.I.J",
267 | ],
268 | },
269 | "B": {
270 | "inputs": [
271 | ],
272 | "models": [
273 | "D0740-B.M.100.I.D",
274 | "D0740-B.M.100.I.H",
275 | "D0740-B.M.100.I.I",
276 | "D0740-B.M.100.I.J",
277 | ],
278 | },
279 | "C": {
280 | "inputs": [
281 | ],
282 | "models": [
283 | "D0740-C.M.100.I.D",
284 | "D0740-C.M.100.I.H",
285 | "D0740-C.M.100.I.I",
286 | "D0740-C.M.100.I.J",
287 | ],
288 | },
289 | },
290 | "D0743J": {
291 | "A": {
292 | "inputs": [
293 | ],
294 | "models": [
295 | "D0743-A.M.100.J.B",
296 | "D0743-A.M.100.J.H",
297 | "D0743-A.M.100.J.I",
298 | "D0743-A.M.100.J.J",
299 | ],
300 | },
301 | "B": {
302 | "inputs": [
303 | ],
304 | "models": [
305 | "D0743-B.M.100.J.B",
306 | "D0743-B.M.100.J.H",
307 | "D0743-B.M.100.J.I",
308 | "D0743-B.M.100.J.J",
309 | ],
310 | },
311 | "C": {
312 | "inputs": [
313 | ],
314 | "models": [
315 | "D0743-C.M.100.J.B",
316 | "D0743-C.M.100.J.H",
317 | "D0743-C.M.100.J.I",
318 | "D0743-C.M.100.J.J",
319 | ],
320 | },
321 | },
322 | }
323 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/sumpy/data/duc03_task2.json:
--------------------------------------------------------------------------------
1 | {
2 | "D30003": {
3 | "inputs": [
4 | "APW19981017.0151",
5 | "NYT19981017.0177",
6 | "APW19981017.0306",
7 | "APW19981017.0477",
8 | "APW19981017.0507",
9 | "NYT19981018.0098",
10 | "NYT19981018.0123",
11 | "NYT19981018.0160",
12 | "NYT19981018.0185",
13 | "APW19981018.0410"
14 | ],
15 | "models": [
16 | "D30003.M.100.T.B",
17 | "D30003.M.100.T.C",
18 | "D30003.M.100.T.D",
19 | "D30003.M.100.T.E"
20 | ]
21 | },
22 | "D30005": {
23 | "inputs": [
24 | "NYT19981007.0383",
25 | "NYT19981007.0384",
26 | "NYT19981016.0283",
27 | "NYT19981017.0140",
28 | "NYT19981021.0378",
29 | "NYT19981022.0453",
30 | "NYT19981104.0491",
31 | "APW19981104.0772",
32 | "APW19981105.0282",
33 | "NYT19981110.0432"
34 | ],
35 | "models": [
36 | "D30005.M.100.T.G",
37 | "D30005.M.100.T.H",
38 | "D30005.M.100.T.I",
39 | "D30005.M.100.T.J"
40 | ]
41 | },
42 | "D30010": {
43 | "inputs": [
44 | "NYT19981106.0468",
45 | "NYT19981106.0494",
46 | "APW19981106.0520",
47 | "APW19981106.0572",
48 | "APW19981106.0851",
49 | "APW19981106.1126",
50 | "APW19981107.0116",
51 | "APW19981107.0118",
52 | "APW19981107.0131",
53 | "APW19981107.0143"
54 | ],
55 | "models": [
56 | "D30010.M.100.T.F",
57 | "D30010.M.100.T.G",
58 | "D30010.M.100.T.H",
59 | "D30010.M.100.T.I"
60 | ]
61 | },
62 | "D30012": {
63 | "inputs": [
64 | "NYT19981012.0242",
65 | "NYT19981012.0245",
66 | "NYT19981115.0065",
67 | "NYT19981115.0091",
68 | "NYT19981116.0199",
69 | "NYT19981117.0290",
70 | "APW19981117.0597",
71 | "APW19981117.0914",
72 | "APW19981117.1226",
73 | "NYT19981118.0287"
74 | ],
75 | "models": [
76 | "D30012.M.100.T.A",
77 | "D30012.M.100.T.B",
78 | "D30012.M.100.T.C",
79 | "D30012.M.100.T.D"
80 | ]
81 | },
82 | "D30016": {
83 | "inputs": [
84 | "NYT19981001.0440",
85 | "APW19981006.0543",
86 | "APW19981027.1075",
87 | "APW19981027.1082",
88 | "NYT19981029.0472",
89 | "APW19981105.0853",
90 | "NYT19981112.0477",
91 | "NYT19981118.0185",
92 | "APW19981122.0381",
93 | "APW19981122.0382"
94 | ],
95 | "models": [
96 | "D30016.M.100.T.A",
97 | "D30016.M.100.T.B",
98 | "D30016.M.100.T.C",
99 | "D30016.M.100.T.J"
100 | ]
101 | },
102 | "D30020": {
103 | "inputs": [
104 | "APW19981103.0271",
105 | "APW19981116.0496",
106 | "APW19981119.0262",
107 | "APW19981125.0256",
108 | "APW19981202.0568",
109 | "APW19981204.0252",
110 | "APW19981206.0169",
111 | "APW19981206.0201",
112 | "APW19981206.0390",
113 | "APW19981206.0393"
114 | ],
115 | "models": [
116 | "D30020.M.100.T.E",
117 | "D30020.M.100.T.F",
118 | "D30020.M.100.T.G",
119 | "D30020.M.100.T.H"
120 | ]
121 | },
122 | "D30025": {
123 | "inputs": [
124 | "NYT19981004.0131",
125 | "NYT19981004.0152",
126 | "NYT19981005.0331",
127 | "APW19981005.1097",
128 | "NYT19981006.0468",
129 | "APW19981006.1099",
130 | "NYT19981025.0180",
131 | "APW19981025.0395",
132 | "NYT19981110.0322",
133 | "APW19981231.0565"
134 | ],
135 | "models": [
136 | "D30025.M.100.T.D",
137 | "D30025.M.100.T.E",
138 | "D30025.M.100.T.F",
139 | "D30025.M.100.T.G"
140 | ]
141 | },
142 | "D30028": {
143 | "inputs": [
144 | "NYT19981003.0058",
145 | "APW19981003.0487",
146 | "APW19981003.0645",
147 | "APW19981003.0646",
148 | "APW19981003.0741",
149 | "NYT19981004.0069",
150 | "APW19981004.0146",
151 | "APW19981004.0172",
152 | "APW19981004.0175",
153 | "APW19981004.0180"
154 | ],
155 | "models": [
156 | "D30028.M.100.T.C",
157 | "D30028.M.100.T.D",
158 | "D30028.M.100.T.E",
159 | "D30028.M.100.T.F"
160 | ]
161 | },
162 | "D30034": {
163 | "inputs": [
164 | "NYT19981029.0389",
165 | "APW19981101.0202",
166 | "APW19981104.0812",
167 | "APW19981106.0542",
168 | "APW19981106.0551",
169 | "APW19981111.0598",
170 | "APW19981112.0549",
171 | "APW19981115.0219",
172 | "APW19981120.0282",
173 | "NYT19981124.0353"
174 | ],
175 | "models": [
176 | "D30034.M.100.T.A",
177 | "D30034.M.100.T.B",
178 | "D30034.M.100.T.I",
179 | "D30034.M.100.T.J"
180 | ]
181 | },
182 | "D30040": {
183 | "inputs": [
184 | "APW19981124.0254",
185 | "APW19981124.0256",
186 | "NYT19981124.0267",
187 | "APW19981205.0220",
188 | "NYT19981206.0110",
189 | "NYT19981206.0144",
190 | "APW19981229.0756",
191 | "APW19981229.0763",
192 | "APW19981230.0983",
193 | "APW19981230.0991"
194 | ],
195 | "models": [
196 | "D30040.M.100.T.B",
197 | "D30040.M.100.T.C",
198 | "D30040.M.100.T.D",
199 | "D30040.M.100.T.E"
200 | ]
201 | },
202 | "D30042": {
203 | "inputs": [
204 | "APW19981020.1108",
205 | "NYT19981021.0303",
206 | "APW19981028.0445",
207 | "NYT19981031.0088",
208 | "APW19981123.1112",
209 | "APW19981123.1153",
210 | "APW19981125.0279",
211 | "APW19981125.0886",
212 | "APW19981125.0903",
213 | "APW19981129.0652"
214 | ],
215 | "models": [
216 | "D30042.M.100.T.A",
217 | "D30042.M.100.T.B",
218 | "D30042.M.100.T.C",
219 | "D30042.M.100.T.D"
220 | ]
221 | },
222 | "D30044": {
223 | "inputs": [
224 | "NYT19981021.0318",
225 | "APW19981021.0554",
226 | "APW19981104.0265",
227 | "APW19981104.0525",
228 | "APW19981106.1119",
229 | "APW19981110.0230",
230 | "APW19981113.0541",
231 | "APW19981113.0895",
232 | "APW19981113.0896",
233 | "APW19981114.0178"
234 | ],
235 | "models": [
236 | "D30044.M.100.T.A",
237 | "D30044.M.100.T.H",
238 | "D30044.M.100.T.I",
239 | "D30044.M.100.T.J"
240 | ]
241 | },
242 | "D30048": {
243 | "inputs": [
244 | "NYT19981001.0351",
245 | "NYT19981003.0074",
246 | "APW19981003.0705",
247 | "NYT19981004.0131",
248 | "NYT19981004.0152",
249 | "NYT19981005.0331",
250 | "NYT19981005.0365",
251 | "NYT19981006.0468",
252 | "APW19981014.0523",
253 | "NYT19981016.0286"
254 | ],
255 | "models": [
256 | "D30048.M.100.T.G",
257 | "D30048.M.100.T.H",
258 | "D30048.M.100.T.I",
259 | "D30048.M.100.T.J"
260 | ]
261 | },
262 | "D30050": {
263 | "inputs": [
264 | "NYT19981003.0061",
265 | "NYT19981004.0121",
266 | "NYT19981004.0125",
267 | "NYT19981005.0379",
268 | "NYT19981005.0445",
269 | "NYT19981007.0352",
270 | "NYT19981007.0353",
271 | "NYT19981007.0355",
272 | "NYT19981007.0395",
273 | "NYT19981008.0467"
274 | ],
275 | "models": [
276 | "D30050.M.100.T.A",
277 | "D30050.M.100.T.B",
278 | "D30050.M.100.T.C",
279 | "D30050.M.100.T.J"
280 | ]
281 | },
282 | "D30051": {
283 | "inputs": [
284 | "NYT19981001.0377",
285 | "APW19981012.0791",
286 | "APW19981014.0564",
287 | "APW19981016.0667",
288 | "APW19981021.0246",
289 | "APW19981029.0281",
290 | "APW19981104.0524",
291 | "APW19981104.0537",
292 | "APW19981105.0609",
293 | "APW19981107.0700"
294 | ],
295 | "models": [
296 | "D30051.M.100.T.F",
297 | "D30051.M.100.T.G",
298 | "D30051.M.100.T.H",
299 | "D30051.M.100.T.I"
300 | ]
301 | },
302 | "D30056": {
303 | "inputs": [
304 | "APW19981004.0717",
305 | "APW19981005.0718",
306 | "APW19981014.0284",
307 | "APW19981026.0225",
308 | "APW19981208.0286",
309 | "NYT19981208.0294",
310 | "APW19981208.0313",
311 | "APW19981208.0315",
312 | "APW19981208.0876"
313 | ],
314 | "models": [
315 | "D30056.M.100.T.E",
316 | "D30056.M.100.T.F",
317 | "D30056.M.100.T.G",
318 | "D30056.M.100.T.H"
319 | ]
320 | },
321 | "D31001": {
322 | "inputs": [
323 | "APW19981008.0841",
324 | "APW19981026.0485",
325 | "APW19981026.0787",
326 | "APW19981028.0231",
327 | "NYT19981028.0331",
328 | "NYT19981029.0366",
329 | "NYT19981031.0150",
330 | "APW19981031.0742",
331 | "NYT19981107.0056",
332 | "NYT19981107.0057"
333 | ],
334 | "models": [
335 | "D31001.M.100.T.D",
336 | "D31001.M.100.T.E",
337 | "D31001.M.100.T.F",
338 | "D31001.M.100.T.G"
339 | ]
340 | },
341 | "D31002": {
342 | "inputs": [
343 | "NYT19981003.0082",
344 | "APW19981003.0170",
345 | "APW19981003.0180",
346 | "NYT19981003.0187",
347 | "APW19981003.0470",
348 | "APW19981003.0473",
349 | "APW19981003.0492",
350 | "NYT19981004.0056",
351 | "APW19981004.0165",
352 | "APW19981004.0171"
353 | ],
354 | "models": [
355 | "D31002.M.100.T.A",
356 | "D31002.M.100.T.B",
357 | "D31002.M.100.T.I",
358 | "D31002.M.100.T.J"
359 | ]
360 | },
361 | "D31009": {
362 | "inputs": [
363 | "NYT19981125.0347",
364 | "APW19981125.0544",
365 | "APW19981125.0898",
366 | "APW19981126.0707",
367 | "APW19981126.0971",
368 | "APW19981126.1022",
369 | "APW19981129.0435",
370 | "APW19981129.0625",
371 | "APW19981130.0508",
372 | "APW19981202.0281"
373 | ],
374 | "models": [
375 | "D31009.M.100.T.A",
376 | "D31009.M.100.T.H",
377 | "D31009.M.100.T.I",
378 | "D31009.M.100.T.J"
379 | ]
380 | },
381 | "D31010": {
382 | "inputs": [
383 | "APW19981123.0259",
384 | "APW19981123.0532",
385 | "APW19981123.1118",
386 | "APW19981125.0278",
387 | "NYT19981125.0289",
388 | "APW19981130.0222",
389 | "APW19981203.0965",
390 | "APW19981203.0970",
391 | "APW19981205.0353",
392 | "APW19981205.0560"
393 | ],
394 | "models": [
395 | "D31010.M.100.T.C",
396 | "D31010.M.100.T.D",
397 | "D31010.M.100.T.E",
398 | "D31010.M.100.T.F"
399 | ]
400 | },
401 | "D31011": {
402 | "inputs": [
403 | "NYT19981022.0367",
404 | "APW19981023.0551",
405 | "APW19981024.0164",
406 | "APW19981024.0343",
407 | "APW19981025.0231",
408 | "APW19981025.0412",
409 | "APW19981025.0922",
410 | "APW19981029.0560",
411 | "APW19981104.0507",
412 | "APW19981112.0305"
413 | ],
414 | "models": [
415 | "D31011.M.100.T.G",
416 | "D31011.M.100.T.H",
417 | "D31011.M.100.T.I",
418 | "D31011.M.100.T.J"
419 | ]
420 | },
421 | "D31013": {
422 | "inputs": [
423 | "NYT19981024.0136",
424 | "NYT19981024.0193",
425 | "NYT19981025.0178",
426 | "NYT19981025.0186",
427 | "NYT19981025.0188",
428 | "NYT19981025.0236",
429 | "NYT19981025.0239",
430 | "NYT19981025.0249",
431 | "NYT19981026.0446",
432 | "NYT19981027.0421"
433 | ],
434 | "models": [
435 | "D31013.M.100.T.B",
436 | "D31013.M.100.T.C",
437 | "D31013.M.100.T.D",
438 | "D31013.M.100.T.E"
439 | ]
440 | },
441 | "D31022": {
442 | "inputs": [
443 | "NYT19981030.0329",
444 | "APW19981030.1037",
445 | "APW19981030.1041",
446 | "APW19981030.1046",
447 | "APW19981030.1066",
448 | "APW19981031.0314",
449 | "APW19981031.0551",
450 | "APW19981101.0536",
451 | "APW19981101.0556",
452 | "APW19981102.0190"
453 | ],
454 | "models": [
455 | "D31022.M.100.T.F",
456 | "D31022.M.100.T.G",
457 | "D31022.M.100.T.H",
458 | "D31022.M.100.T.I"
459 | ]
460 | },
461 | "D31027": {
462 | "inputs": [
463 | "APW19981018.0638",
464 | "APW19981022.0848",
465 | "APW19981023.0519",
466 | "APW19981023.1147",
467 | "APW19981024.0182",
468 | "APW19981024.0186",
469 | "APW19981025.0209",
470 | "APW19981025.0210",
471 | "APW19981025.0218",
472 | "APW19981025.0234"
473 | ],
474 | "models": [
475 | "D31027.M.100.T.A",
476 | "D31027.M.100.T.B",
477 | "D31027.M.100.T.C",
478 | "D31027.M.100.T.D"
479 | ]
480 | },
481 | "D31028": {
482 | "inputs": [
483 | "NYT19981001.0271",
484 | "APW19981003.0141",
485 | "APW19981003.0142",
486 | "NYT19981004.0072",
487 | "APW19981004.0574",
488 | "APW19981005.1108",
489 | "NYT19981020.0178",
490 | "NYT19981026.0341",
491 | "APW19981031.0317",
492 | "NYT19981107.0072"
493 | ],
494 | "models": [
495 | "D31028.M.100.T.A",
496 | "D31028.M.100.T.B",
497 | "D31028.M.100.T.C",
498 | "D31028.M.100.T.J"
499 | ]
500 | },
501 | "D31031": {
502 | "inputs": [
503 | "NYT19981005.0441",
504 | "NYT19981006.0047",
505 | "NYT19981006.0127",
506 | "NYT19981006.0391",
507 | "NYT19981006.0397",
508 | "NYT19981007.0399",
509 | "NYT19981008.0412",
510 | "NYT19981009.0452",
511 | "NYT19981009.0476",
512 | "NYT19981011.0181"
513 | ],
514 | "models": [
515 | "D31031.M.100.T.E",
516 | "D31031.M.100.T.F",
517 | "D31031.M.100.T.G",
518 | "D31031.M.100.T.H"
519 | ]
520 | },
521 | "D31033": {
522 | "inputs": [
523 | "NYT19981018.0102",
524 | "NYT19981019.0284",
525 | "NYT19981019.0476",
526 | "NYT19981020.0315",
527 | "NYT19981020.0345",
528 | "NYT19981021.0064",
529 | "NYT19981021.0066",
530 | "NYT19981021.0400",
531 | "NYT19981022.0507",
532 | "NYT19981023.0251"
533 | ],
534 | "models": [
535 | "D31033.M.100.T.D",
536 | "D31033.M.100.T.E",
537 | "D31033.M.100.T.F",
538 | "D31033.M.100.T.G"
539 | ]
540 | },
541 | "D31038": {
542 | "inputs": [
543 | "APW19981002.0556",
544 | "NYT19981003.0093",
545 | "APW19981003.0517",
546 | "NYT19981005.0386",
547 | "NYT19981005.0454",
548 | "NYT19981007.0383",
549 | "NYT19981016.0283",
550 | "NYT19981017.0140",
551 | "NYT19981021.0378",
552 | "NYT19981022.0453"
553 | ],
554 | "models": [
555 | "D31038.M.100.T.A",
556 | "D31038.M.100.T.B",
557 | "D31038.M.100.T.I",
558 | "D31038.M.100.T.J"
559 | ]
560 | },
561 | "D31041": {
562 | "inputs": [
563 | "APW19981003.0184",
564 | "APW19981010.0163",
565 | "APW19981110.0245",
566 | "APW19981122.0379",
567 | "APW19981123.0257",
568 | "APW19981124.0261",
569 | "APW19981202.0265",
570 | "APW19981203.0309",
571 | "NYT19981218.0224"
572 | ],
573 | "models": [
574 | "D31041.M.100.T.A",
575 | "D31041.M.100.T.H",
576 | "D31041.M.100.T.I",
577 | "D31041.M.100.T.J"
578 | ]
579 | },
580 | "D31050": {
581 | "inputs": [
582 | "NYT19981202.0309",
583 | "APW19981202.1274",
584 | "APW19981203.0338",
585 | "NYT19981207.0280",
586 | "NYT19981209.0542",
587 | "NYT19981216.0357",
588 | "APW19981216.0666",
589 | "NYT19981217.0274",
590 | "NYT19981218.0250",
591 | "APW19981220.0155"
592 | ],
593 | "models": [
594 | "D31050.M.100.T.C",
595 | "D31050.M.100.T.D",
596 | "D31050.M.100.T.E",
597 | "D31050.M.100.T.F"
598 | ]
599 | }
600 | }
601 |
--------------------------------------------------------------------------------
/sumpy/util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import tarfile
4 | import re
5 | from datetime import datetime
6 | import corenlp as cnlp
7 | import json
8 | import pkg_resources
9 |
10 | class DUCHelper(object):
11 |
12 |     def __init__(self, duc_path=None, sumpy_data_path=None):
13 |         if duc_path is None:
14 |             duc_path = os.getenv("DUC_DATA", os.path.expanduser("~/DUC"))
15 |         self.duc_path = duc_path
16 |         if sumpy_data_path is None:
17 |             sumpy_data_path = os.getenv("SUMPY_DATA",
18 |                 os.path.join(os.path.expanduser("~"), ".sumpy"))
19 |         self.sumpy_data_path = sumpy_data_path
20 | 
21 | def docset_iter(self, year, task):
22 |
23 | if year == 2003:
24 | if task == 2:
25 | duc_json_path = pkg_resources.resource_filename(
26 | "sumpy", os.path.join("data", "duc03_task2.json"))
27 | with open(duc_json_path, "r") as f:
28 | docsets = json.load(f, strict=False)
29 |
30 | docset_ids = sorted(docsets.keys())
31 | for docset_id in docset_ids:
32 | ds = DUCDocset(
33 | docset_id, 2003, 2,
34 | docsets[docset_id]["inputs"],
35 | os.path.join(
36 | self.sumpy_data_path, "duc2003", "task2",
37 | docset_id, "inputs"),
38 | docsets[docset_id]["models"],
39 | os.path.join(
40 | self.sumpy_data_path, "duc2003", "task2",
41 | docset_id, "models"))
42 |
43 | yield ds
44 |
45 |
46 | elif year == 2004:
47 | if task == 2:
48 | duc_json_path = pkg_resources.resource_filename(
49 | "sumpy", os.path.join("data", "duc04_task2.json"))
50 | with open(duc_json_path, "r") as f:
51 | docsets = json.load(f, strict=False)
52 |
53 | docset_ids = sorted(docsets.keys())
54 | for docset_id in docset_ids:
55 | ds = DUCDocset(
56 | docset_id, 2004, 2,
57 | docsets[docset_id]["inputs"],
58 | os.path.join(
59 | self.sumpy_data_path, "duc2004", "task2",
60 | docset_id, "inputs"),
61 | docsets[docset_id]["models"],
62 | os.path.join(
63 | self.sumpy_data_path, "duc2004", "task2",
64 | docset_id, "models"))
65 |
66 | yield ds
67 |
68 | # elif year == 2007:
69 | # if task == 2:
70 | # for docset_id in self.duc07_task2_docset_ids:
71 | # dsA = DUCDocset(
72 | # docset_id, 2007, 2,
73 | # self.duc07_task2[docset_id]["A"]["inputs"],
74 | # os.path.join(self.duc07_task2_docsets_path,
75 | # "{}-A".format(docset_id)),
76 | # self.duc07_task2[docset_id]["A"]["models"],
77 | # os.path.join(self.duc07_task2_models_path))
78 | # dsB = DUCDocset(
79 | # docset_id, 2007, 2,
80 | # self.duc07_task2[docset_id]["B"]["inputs"],
81 | # os.path.join(self.duc07_task2_docsets_path,
82 | # "{}-B".format(docset_id)),
83 | # self.duc07_task2[docset_id]["B"]["models"],
84 | # os.path.join(self.duc07_task2_models_path))
85 | # dsC = DUCDocset(
86 | # docset_id, 2007, 2,
87 | # self.duc07_task2[docset_id]["C"]["inputs"],
88 | # os.path.join(self.duc07_task2_docsets_path,
89 | # "{}-C".format(docset_id)),
90 | # self.duc07_task2[docset_id]["C"]["models"],
91 | # os.path.join(self.duc07_task2_models_path))
92 | #
93 | # ds = DUCUpdateDocset(
94 | # docset_id, year, task, [dsA, dsB, dsC])
95 | # yield ds
96 |
97 | else:
98 | raise Exception("Bad argument: year is {}".format(year))
99 |
100 | def docsets(self, year, task):
101 | if year == 2003:
102 | if task == 2:
103 | return DUCDocsets([ds for ds in self.docset_iter(2003, 2)])
104 | else:
105 | raise Exception("Bad argument: task is {}".format(task))
106 | elif year == 2004:
107 | if task == 2:
108 | return DUCDocsets([ds for ds in self.docset_iter(2004, 2)])
109 | else:
110 | raise Exception("Bad argument: task is {}".format(task))
111 | else:
112 | raise Exception("Bad argument: year is {}".format(year))
113 |
114 | def install(self, year, task):
115 | if year == 2001:
116 | raise Exception("Not implemented!")
117 | elif year == 2002:
118 |             raise Exception("Not implemented!")
119 | elif year == 2003:
120 | self._install_duc03_task2()
121 | elif year == 2004:
122 | self._install_duc04_task2()
123 | else:
124 | raise Exception("Not implemented!")
125 |
126 | def _install_duc03_task2(self):
127 | data_path = os.path.join(self.sumpy_data_path, "duc2003", "task2")
128 | if not os.path.exists(data_path):
129 | os.makedirs(data_path)
130 | data_path_duc = os.path.join(
131 | self.duc_path, "DUC2003_Summarization_Documents.tgz")
132 | data_path_models = os.path.join(
133 | self.duc_path, "detagged.duc2003.abstracts.tar.gz")
134 |
135 | if not os.path.exists(data_path_duc):
136 | raise Exception("{} does not exist. " \
137 | "Please obtain this file from NIST.".format(
138 | data_path_duc))
139 | if not os.path.exists(data_path_models):
140 | raise Exception("{} does not exist. " \
141 | "Please obtain this file from NIST.".format(
142 | data_path_models))
143 |
144 |
145 | docsets = {}
146 |
147 | docs_tar = os.path.join("DUC2003_Summarization_Documents",
148 | "duc2003_testdata", "task2", "task2.docs.tar.gz")
149 | with tarfile.open(name=data_path_duc, mode="r") as tf:
150 | for m in tf.getmembers():
151 | if m.name == docs_tar:
152 | break
153 |
154 | f = tf.extractfile(m)
155 | from StringIO import StringIO
156 | b = StringIO(f.read())
157 | with tarfile.open(fileobj=b, mode="r") as dtf:
158 | for m in dtf.getmembers():
159 | path, doc_id = os.path.split(m.name)
160 | _, docset_id = os.path.split(path)
161 | text = dtf.extractfile(m).read()
162 | docset_id = docset_id.upper()[:-1]
163 | docset = docsets.get(
164 | docset_id, {"inputs": [], "models": []})
165 | docset["inputs"].append({"input id": doc_id, "text": text})
166 | docsets[docset_id] = docset
167 | with tarfile.open(name=data_path_models, mode="r") as tf:
168 | for m in tf.getmembers():
169 | path, model = os.path.split(m.name)
170 | if os.path.split(path)[1] == "models":
171 |                     if re.search(r'D\d{5}\.\w\.100\.\w\.\w\.html', model):
172 | docset_id = model.split(".")[0]
173 | model_id = os.path.splitext(model)[0]
174 | text = tf.extractfile(m).read()
175 | docsets[docset_id]["models"].append(
176 | {"model id": model_id,
177 | "text": text})
178 |
179 | #annotators=["tokenize", "ssplit"]
180 | #with cnlp.Server(annotators=annotators) as pipeline:
181 | for docset_id, docset in docsets.items():
182 | inputs_path = os.path.join(data_path, docset_id, "inputs")
183 | if not os.path.exists(inputs_path):
184 | os.makedirs(inputs_path)
185 | for input in docset["inputs"]:
186 | input_path = os.path.join(inputs_path, input["input id"])
187 | with open(input_path, "wb") as f:
188 | f.write(input["text"])
189 |
190 | models_path = os.path.join(data_path, docset_id, "models")
191 | if not os.path.exists(models_path):
192 | os.makedirs(models_path)
193 | for model in docset["models"]:
194 | model_path = os.path.join(models_path, model["model id"])
195 | #doc = pipeline.annotate(model["text"])
196 |
197 | with open(model_path, "wb") as f:
198 | f.write(model["text"])
199 | #for sent in doc:
200 | # line = " ".join([str(tok) for tok in sent]) + "\n"
201 | # f.write(line)
202 |
203 |
204 | def _install_duc04_task2(self):
205 | data_path = os.path.join(self.sumpy_data_path, "duc2004", "task2")
206 | if not os.path.exists(data_path):
207 | os.makedirs(data_path)
208 | data_path_duc = os.path.join(
209 | self.duc_path, "DUC2004_Summarization_Documents.tgz")
210 | data_path_models = os.path.join(
211 | self.duc_path, "duc2004_results.tgz")
212 |
213 | if not os.path.exists(data_path_duc):
214 | raise Exception("{} does not exist. " \
215 | "Please obtain this file from NIST.".format(
216 | data_path_duc))
217 | if not os.path.exists(data_path_models):
218 | raise Exception("{} does not exist. " \
219 | "Please obtain this file from NIST.".format(
220 | data_path_models))
221 |
222 | docsets = {}
223 | tgt_path = os.path.join("DUC2004_Summarization_Documents",
224 | "duc2004_testdata", "tasks1and2", "duc2004_tasks1and2_docs",
225 | "docs")
226 | with tarfile.open(name=data_path_duc, mode="r") as tf:
227 | for m in tf.getmembers():
228 | path, doc_id = os.path.split(m.name)
229 | path, docset_id = os.path.split(path)
230 | if path == tgt_path:
231 | docset_id = docset_id.upper()[:-1]
232 | text = tf.extractfile(m).read()
233 | docset = docsets.get(
234 | docset_id, {"inputs": [], "models": []})
235 | docset["inputs"].append({"input id": doc_id, "text": text})
236 | docsets[docset_id] = docset
237 | tgt_path = os.path.join("duc2004_results", "ROUGE",
238 | "duc2004.task2.ROUGE.models.tar.gz")
239 | with tarfile.open(name=data_path_models, mode="r") as tf:
240 |             # getmember raises KeyError if the ROUGE models archive is
241 |             # missing from the results tarball.
242 |             m = tf.getmember(tgt_path)
243 |             models_tar = tf.extractfile(m)
244 | with tarfile.open(fileobj=models_tar, mode="r") as mtf:
245 | for m in mtf.getmembers():
246 | model_id = os.path.split(m.name)[1]
247 | docset_id = model_id.split(".")[0]
248 | text = mtf.extractfile(m).read()
249 | docsets[docset_id]["models"].append(
250 | {"model id": model_id,
251 | "text": text})
252 |
253 | #annotators=["tokenize", "ssplit"]
254 | #with cnlp.Server(annotators=annotators) as pipeline:
255 | for docset_id, docset in docsets.items():
256 | inputs_path = os.path.join(data_path, docset_id, "inputs")
257 | if not os.path.exists(inputs_path):
258 | os.makedirs(inputs_path)
259 | for input in docset["inputs"]:
260 | input_path = os.path.join(inputs_path, input["input id"])
261 | with open(input_path, "wb") as f:
262 | f.write(input["text"])
263 |
264 | models_path = os.path.join(data_path, docset_id, "models")
265 | if not os.path.exists(models_path):
266 | os.makedirs(models_path)
267 | for model in docset["models"]:
268 | model_path = os.path.join(models_path, model["model id"])
269 |
270 | #doc = pipeline.annotate(model["text"])
271 |
272 | with open(model_path, "wb") as f:
273 | f.write(model["text"])
274 | #for sent in doc:
275 | # line = " ".join([str(tok) for tok in sent]) + "\n"
276 | # f.write(line)
277 |
278 |
279 |
280 |
281 | # def _install_duc01_task2(self):
282 | #
283 | # data_path = os.path.join(self.sumpy_data_path, "duc2001", "task2")
284 | # if not os.path.exists(data_path):
285 | # os.makedirs(data_path)
286 | # data_path_duc = os.path.join(
287 | # self.duc_path, "DUC2001_Summarization_Documents.tgz")
288 | #
289 | # if not os.path.exists(data_path_duc):
290 | # raise Exception("{} does not exist. " \
291 | # "Please obtain this file from NIST.".format(
292 | # data_path_duc))
293 | #
294 | #        documents_tar_path = os.path.join("DUC2001_Summarization_Documents",
295 | #            "data", "testtraining", "Duc2001testtraining.tar.gz")
296 | #
297 | #        with tarfile.open(name=data_path_duc, mode="r") as tf:
298 | #            mem_documents_tar = [m for m in tf.getmembers()
299 | #                if m.name == documents_tar_path]
300 | #            tf.extractall(members=mem_documents_tar)
301 | #
302 | #        # extractall writes the inner archive to a path relative to the
303 | #        # current working directory; make sure it actually landed there.
304 | #
305 | #        if not os.path.exists(documents_tar_path):
306 | #            raise Exception("Failed to extract DUC 2001 documents!")
307 | #
308 | #        with tarfile.open(documents_tar_path, mode="r") as tf:
309 | # tf.extractall()
310 | #
311 | # documents_path = "duc2002testtraining"
312 | # if not os.path.exists(documents_path):
313 | # raise Exception("Failed to extract DUC 2001 documents!")
314 | #
315 | # docsets = {}
316 | # for docset_id in os.listdir(documents_path):
317 | # docset_path = os.path.join(documents_path, docset_id)
318 | # articles = []
319 | # for article_name in os.listdir(docset_path):
320 | # if article_name.startswith("ap"):
321 | # year = 1900 + int(article_name[2:4])
322 | # month = int(article_name[4:6])
323 | # day = int(article_name[6:8])
324 | # ts = datetime(year, month, day)
325 | # elif article_name.startswith("wsj"):
326 | # year = 1900 + int(article_name[3:5])
327 | # month = int(article_name[5:7])
328 | # day = int(article_name[7:9])
329 | # ts = datetime(year, month, day)
330 | # elif article_name.startswith("la"):
331 | # year = 1900 + int(article_name[6:8])
332 | # month = int(article_name[2:4])
333 | # day = int(article_name[4:6])
334 | # ts = datetime(year, month, day)
335 | # elif article_name.startswith("ft"):
336 | # year = 1900 + int(article_name[2:4])
337 | # month = int(article_name.split("-")[0][4:])
338 | # ts = datetime(year, month, 1)
339 | # elif article_name.startswith("fbis"):
340 | # ts = datetime(1977,1,1)
341 | # elif article_name.startswith("sjmn"):
342 | #                    ts = datetime(1991,1,1)
343 | # else:
344 | #                    raise Exception("Found unexpected file: {}".format(
345 | # article_name))
346 | # print article_name, ts
347 | # article_path = os.path.join(
348 | # docset_path, article_name, "{}.body".format(article_name))
349 | # with open(article_path, "rb") as f:
350 | # content = f.read()
351 | # articles.append({"input id": article_name,
352 | # "raw text": content,
353 | # "timestamp": ts})
354 | # docsets[docset_id] = articles
355 | #
356 | # shutil.rmtree("DUC2001_Summarization_Documents")
357 | # shutil.rmtree(documents_path)
358 |
359 | class DUCDocsets(object):
360 | def __init__(self, docsets):
361 | self._docsets = {ds.docset_id: ds for ds in docsets}
362 |
363 | def __getitem__(self, ds_id):
364 | return self._docsets[ds_id]
365 |
366 | class DUCDocset(object):
367 | def __init__(self, docset_id, year, task, inputs, input_root,
368 | models, model_root):
369 | self.docset_id = docset_id
370 | self.year = year
371 | self.task = task
372 | self.inputs = inputs
373 | self.input_root = input_root
374 | self.models = models
375 | self.model_root = model_root
376 |
377 | def __str__(self):
378 | return "DUCDocset({}, {}, {}, {} inputs, {}, {} models, {})".format(
379 | self.docset_id, self.year, self.task, len(self.inputs),
380 | self.input_root[:10] + "...", len(self.models),
381 | self.model_root[:10] + "...")
382 |
383 | def input_iter(self):
384 | for doc_id in self.inputs:
385 |             # Ids like "APW19981016.0240" embed the date as YYYYMMDD.
386 |             timestamp = datetime(int(doc_id[3:7]), int(doc_id[7:9]), int(doc_id[9:11]))
387 |
388 | yield DUCDocument(
389 | doc_id, timestamp, os.path.join(self.input_root, doc_id))
390 |
391 | def model_iter(self):
392 | for doc_id in self.models:
393 | yield DUCModel(doc_id, os.path.join(self.model_root, doc_id))
394 |
395 | class DUCUpdateDocset(object):
396 | def __init__(self, docset_id, year, task, docsets):
397 | self.docset_id = docset_id
398 | self.year = year
399 | self.task = task
400 | self.docsets = docsets
401 |
402 | def update_iter(self):
403 | for update_ds in self.docsets:
404 | yield update_ds
405 |
406 | class DUCDocument(object):
407 | def __init__(self, doc_id, timestamp, path):
408 | self.doc_id = doc_id
409 | self.timestamp = timestamp
410 | self.path = path
411 | self._text = None
412 |
413 | def _read(self):
414 | if os.path.exists(self.path):
415 | with open(self.path, "rb") as f:
416 | self._text = f.read()
417 | else:
418 | raise Exception("DUCDocument {} not found at path {}".format(
419 | self.doc_id, self.path))
420 |
421 | def __str__(self):
422 | if self._text is None:
423 | self._read()
424 | return self._text
425 |
426 | def __unicode__(self):
427 | if self._text is None:
428 | self._read()
429 | return self._text.decode("utf-8")
430 |
431 | def __bytes__(self):
432 | if self._text is None:
433 | self._read()
434 | return self._text
435 |
436 | class DUCModel(object):
437 | def __init__(self, doc_id, path):
438 | self.doc_id = doc_id
439 | self.path = path
440 | self._text = None
441 |
442 | def _read(self):
443 | if os.path.exists(self.path):
444 | with open(self.path, "rb") as f:
445 | self._text = f.read()
446 | else:
447 | raise Exception("DUCModel {} not found at path {}".format(
448 | self.doc_id, self.path))
449 |
450 | def __str__(self):
451 | if self._text is None:
452 | self._read()
453 | return self._text
454 |
455 | def __unicode__(self):
456 | if self._text is None:
457 | self._read()
458 | return self._text.decode("utf-8")
459 |
460 | def __bytes__(self):
461 | if self._text is None:
462 | self._read()
463 | return self._text
464 |
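465 | # Illustrative usage sketch (added for documentation; nothing in this file
466 | # calls it). It assumes `docset` is a DUCDocset that has already been
467 | # constructed against the per-docset "inputs"/"models" directories written
468 | # out by the installers above, and it simply materializes the lazily read
469 | # texts.
470 | def _example_collect_docset_texts(docset):
471 |     # input_iter() yields DUCDocument objects; bytes(doc) triggers the lazy
472 |     # read of the raw document from doc.path.
473 |     input_texts = {doc.doc_id: bytes(doc) for doc in docset.input_iter()}
474 |     # model_iter() yields DUCModel objects for the reference summaries.
475 |     model_texts = {model.doc_id: bytes(model) for model in docset.model_iter()}
476 |     return input_texts, model_texts
477 | 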
--------------------------------------------------------------------------------
/sumpy/data/duc04_task2.json:
--------------------------------------------------------------------------------
1 | {
2 | "D30001": {
3 | "inputs": [
4 | "APW19981016.0240",
5 | "APW19981022.0269",
6 | "APW19981026.0220",
7 | "APW19981027.0491",
8 | "APW19981031.0167",
9 | "APW19981113.0251",
10 | "APW19981116.0205",
11 | "APW19981118.0276",
12 | "APW19981120.0274",
13 | "APW19981124.0267"
14 | ],
15 | "models": [
16 | "D30001.M.100.T.A",
17 | "D30001.M.100.T.B",
18 | "D30001.M.100.T.C",
19 | "D30001.M.100.T.D"
20 | ]
21 | },
22 | "D30002": {
23 | "inputs": [
24 | "APW19981027.0241",
25 | "APW19981028.1120",
26 | "APW19981029.0570",
27 | "APW19981031.0720",
28 | "APW19981101.0843",
29 | "APW19981102.0737",
30 | "APW19981103.0526",
31 | "APW19981104.0539",
32 | "APW19981105.1220",
33 | "APW19981106.0869"
34 | ],
35 | "models": [
36 | "D30002.M.100.T.A",
37 | "D30002.M.100.T.B",
38 | "D30002.M.100.T.C",
39 | "D30002.M.100.T.E"
40 | ]
41 | },
42 | "D30003": {
43 | "inputs": [
44 | "APW19981018.0423",
45 | "APW19981019.0098",
46 | "APW19981020.0241",
47 | "APW19981021.0557",
48 | "APW19981022.1132",
49 | "APW19981023.1166",
50 | "APW19981024.0192",
51 | "APW19981025.0449",
52 | "NYT19981026.0292",
53 | "APW19981028.0444"
54 | ],
55 | "models": [
56 | "D30003.M.100.T.A",
57 | "D30003.M.100.T.B",
58 | "D30003.M.100.T.C",
59 | "D30003.M.100.T.F"
60 | ]
61 | },
62 | "D30005": {
63 | "inputs": [
64 | "NYT19981003.0093",
65 | "APW19981003.0517",
66 | "APW19981111.0288",
67 | "APW19981112.0551",
68 | "NYT19981113.0410",
69 | "APW19981119.0552",
70 | "APW19981120.0887",
71 | "APW19981129.0665",
72 | "NYT19981201.0444",
73 | "NYT19981202.0428"
74 | ],
75 | "models": [
76 | "D30005.M.100.T.A",
77 | "D30005.M.100.T.B",
78 | "D30005.M.100.T.C",
79 | "D30005.M.100.T.G"
80 | ]
81 | },
82 | "D30006": {
83 | "inputs": [
84 | "NYT19981003.0083",
85 | "NYT19981005.0385",
86 | "NYT19981006.0396",
87 | "NYT19981008.0461",
88 | "NYT19981013.0354",
89 | "NYT19981014.0003",
90 | "NYT19981018.0175",
91 | "APW19981018.0836",
92 | "APW19981018.0888",
93 | "NYT19981021.0014"
94 | ],
95 | "models": [
96 | "D30006.M.100.T.A",
97 | "D30006.M.100.T.B",
98 | "D30006.M.100.T.C",
99 | "D30006.M.100.T.H"
100 | ]
101 | },
102 | "D30007": {
103 | "inputs": [
104 | "APW19981001.0312",
105 | "APW19981002.0522",
106 | "APW19981002.0567",
107 | "APW19981004.0851",
108 | "APW19981006.0556",
109 | "APW19981007.0574",
110 | "APW19981010.0696",
111 | "APW19981011.0515",
112 | "APW19981011.0744",
113 | "APW19981013.0275"
114 | ],
115 | "models": [
116 | "D30007.M.100.T.A",
117 | "D30007.M.100.T.B",
118 | "D30007.M.100.T.D",
119 | "D30007.M.100.T.E"
120 | ]
121 | },
122 | "D30008": {
123 | "inputs": [
124 | "APW19981004.0281",
125 | "APW19981006.0251",
126 | "APW19981013.0853",
127 | "APW19981016.0437",
128 | "APW19981019.0104",
129 | "APW19981023.0281",
130 | "NYT19981105.0538",
131 | "APW19981108.0837",
132 | "APW19981109.0264",
133 | "APW19981109.0274"
134 | ],
135 | "models": [
136 | "D30008.M.100.T.A",
137 | "D30008.M.100.T.B",
138 | "D30008.M.100.T.D",
139 | "D30008.M.100.T.G"
140 | ]
141 | },
142 | "D30010": {
143 | "inputs": [
144 | "APW19981106.0273",
145 | "APW19981106.0274",
146 | "APW19981106.0275",
147 | "APW19981106.0276",
148 | "NYT19981107.0251",
149 | "APW19981107.0568",
150 | "APW19981107.0744",
151 | "APW19981107.0752",
152 | "NYT19981108.0136",
153 | "APW19981108.0188"
154 | ],
155 | "models": [
156 | "D30010.M.100.T.A",
157 | "D30010.M.100.T.B",
158 | "D30010.M.100.T.D",
159 | "D30010.M.100.T.H"
160 | ]
161 | },
162 | "D30011": {
163 | "inputs": [
164 | "APW19981001.0315",
165 | "APW19981002.0550",
166 | "APW19981002.1081",
167 | "NYT19981003.0120",
168 | "APW19981003.0144",
169 | "APW19981004.0281",
170 | "APW19981005.0484",
171 | "APW19981006.0251",
172 | "APW19981008.0259",
173 | "APW19981008.0527"
174 | ],
175 | "models": [
176 | "D30011.M.100.T.A",
177 | "D30011.M.100.T.B",
178 | "D30011.M.100.T.E",
179 | "D30011.M.100.T.F"
180 | ]
181 | },
182 | "D30015": {
183 | "inputs": [
184 | "NYT19981004.0102",
185 | "APW19981005.0205",
186 | "APW19981005.0223",
187 | "APW19981005.0233",
188 | "NYT19981005.0391",
189 | "APW19981005.0496",
190 | "APW19981005.0506",
191 | "APW19981005.0762",
192 | "APW19981005.1072",
193 | "APW19981005.1082"
194 | ],
195 | "models": [
196 | "D30015.M.100.T.A",
197 | "D30015.M.100.T.B",
198 | "D30015.M.100.T.E",
199 | "D30015.M.100.T.H"
200 | ]
201 | },
202 | "D30017": {
203 | "inputs": [
204 | "APW19981010.0187",
205 | "APW19981022.0488",
206 | "APW19981104.0245",
207 | "APW19981110.0240",
208 | "NYT19981114.0099",
209 | "APW19981118.0898",
210 | "APW19981119.0262",
211 | "APW19981124.0251",
212 | "NYT19981209.0451",
213 | "APW19981221.0189"
214 | ],
215 | "models": [
216 | "D30017.M.100.T.A",
217 | "D30017.M.100.T.B",
218 | "D30017.M.100.T.F",
219 | "D30017.M.100.T.G"
220 | ]
221 | },
222 | "D30020": {
223 | "inputs": [
224 | "APW19981002.0557",
225 | "APW19981029.0521",
226 | "APW19981030.1074",
227 | "APW19981102.0220",
228 | "APW19981126.0432",
229 | "APW19981126.0450",
230 | "APW19981128.0168",
231 | "APW19981206.0174",
232 | "APW19981206.0199",
233 | "APW19981206.0379"
234 | ],
235 | "models": [
236 | "D30020.M.100.T.A",
237 | "D30020.M.100.T.C",
238 | "D30020.M.100.T.D",
239 | "D30020.M.100.T.E"
240 | ]
241 | },
242 | "D30022": {
243 | "inputs": [
244 | "APW19981005.0231",
245 | "APW19981015.0177",
246 | "NYT19981202.0309",
247 | "APW19981202.1274",
248 | "APW19981203.0338",
249 | "NYT19981207.0280",
250 | "NYT19981209.0542",
251 | "NYT19981216.0357",
252 | "APW19981216.0666",
253 | "NYT19981217.0274"
254 | ],
255 | "models": [
256 | "D30022.M.100.T.A",
257 | "D30022.M.100.T.C",
258 | "D30022.M.100.T.D",
259 | "D30022.M.100.T.F"
260 | ]
261 | },
262 | "D30024": {
263 | "inputs": [
264 | "NYT19981007.0464",
265 | "NYT19981104.0369",
266 | "NYT19981104.0516",
267 | "NYT19981104.0545",
268 | "NYT19981104.0597",
269 | "NYT19981104.0600",
270 | "NYT19981105.0439",
271 | "NYT19981105.0509",
272 | "NYT19981105.0525",
273 | "NYT19981106.0464"
274 | ],
275 | "models": [
276 | "D30024.M.100.T.A",
277 | "D30024.M.100.T.C",
278 | "D30024.M.100.T.D",
279 | "D30024.M.100.T.G"
280 | ]
281 | },
282 | "D30026": {
283 | "inputs": [
284 | "NYT19981101.0082",
285 | "NYT19981122.0131",
286 | "NYT19981122.0163",
287 | "NYT19981123.0453",
288 | "NYT19981123.0458",
289 | "NYT19981123.0478",
290 | "NYT19981124.0340",
291 | "NYT19981124.0365",
292 | "NYT19981124.0411",
293 | "NYT19981125.0073"
294 | ],
295 | "models": [
296 | "D30026.M.100.T.A",
297 | "D30026.M.100.T.C",
298 | "D30026.M.100.T.D",
299 | "D30026.M.100.T.H"
300 | ]
301 | },
302 | "D30027": {
303 | "inputs": [
304 | "NYT19981001.0363",
305 | "NYT19981001.0379",
306 | "APW19981001.1177",
307 | "NYT19981002.0250",
308 | "NYT19981002.0300",
309 | "APW19981002.0778",
310 | "APW19981002.0783",
311 | "APW19981002.0809",
312 | "APW19981003.0292",
313 | "NYT19981004.0132"
314 | ],
315 | "models": [
316 | "D30027.M.100.T.A",
317 | "D30027.M.100.T.C",
318 | "D30027.M.100.T.E",
319 | "D30027.M.100.T.G"
320 | ]
321 | },
322 | "D30028": {
323 | "inputs": [
324 | "APW19981001.0539",
325 | "APW19981004.0182",
326 | "APW19981004.0296",
327 | "APW19981004.0321",
328 | "APW19981004.0550",
329 | "APW19981005.0236",
330 | "APW19981005.0457",
331 | "APW19981005.0467",
332 | "APW19981005.0474",
333 | "APW19981005.1033"
334 | ],
335 | "models": [
336 | "D30028.M.100.T.A",
337 | "D30028.M.100.T.C",
338 | "D30028.M.100.T.F",
339 | "D30028.M.100.T.G"
340 | ]
341 | },
342 | "D30029": {
343 | "inputs": [
344 | "APW19981224.0814",
345 | "APW19981226.0185",
346 | "APW19981227.0319",
347 | "APW19981227.0766",
348 | "APW19981227.0803",
349 | "APW19981227.0836",
350 | "APW19981227.0840",
351 | "APW19981227.0853",
352 | "APW19981227.0870",
353 | "APW19981228.0467"
354 | ],
355 | "models": [
356 | "D30029.M.100.T.A",
357 | "D30029.M.100.T.C",
358 | "D30029.M.100.T.F",
359 | "D30029.M.100.T.H"
360 | ]
361 | },
362 | "D30031": {
363 | "inputs": [
364 | "NYT19981203.0460",
365 | "NYT19981204.0365",
366 | "NYT19981206.0178",
367 | "APW19981207.0418",
368 | "APW19981207.0577",
369 | "APW19981207.0578",
370 | "APW19981207.0580",
371 | "APW19981207.0581",
372 | "APW19981207.0583",
373 | "NYT19981113.0404"
374 | ],
375 | "models": [
376 | "D30031.M.100.T.A",
377 | "D30031.M.100.T.D",
378 | "D30031.M.100.T.E",
379 | "D30031.M.100.T.F"
380 | ]
381 | },
382 | "D30033": {
383 | "inputs": [
384 | "NYT19981119.0380",
385 | "APW19981127.0244",
386 | "APW19981203.0649",
387 | "APW19981203.1240",
388 | "NYT19981223.0347",
389 | "APW19981228.0189",
390 | "APW19981229.0467",
391 | "APW19981230.0431",
392 | "APW19981230.0473",
393 | "APW19981231.0143"
394 | ],
395 | "models": [
396 | "D30033.M.100.T.A",
397 | "D30033.M.100.T.D",
398 | "D30033.M.100.T.E",
399 | "D30033.M.100.T.G"
400 | ]
401 | },
402 | "D30034": {
403 | "inputs": [
404 | "APW19981124.0554",
405 | "APW19981126.0443",
406 | "APW19981130.0497",
407 | "APW19981205.0792",
408 | "APW19981205.0807",
409 | "APW19981211.0972",
410 | "APW19981211.0982",
411 | "APW19981211.0990",
412 | "APW19981212.0541",
413 | "APW19981221.0448"
414 | ],
415 | "models": [
416 | "D30034.M.100.T.A",
417 | "D30034.M.100.T.D",
418 | "D30034.M.100.T.F",
419 | "D30034.M.100.T.G"
420 | ]
421 | },
422 | "D30036": {
423 | "inputs": [
424 | "APW19981006.0833",
425 | "APW19981007.0823",
426 | "NYT19981008.0338",
427 | "APW19981008.0523",
428 | "APW19981008.1113",
429 | "APW19981009.0788",
430 | "APW19981011.0541",
431 | "APW19981012.0252",
432 | "APW19981012.0267",
433 | "NYT19981012.0334"
434 | ],
435 | "models": [
436 | "D30036.M.100.T.A",
437 | "D30036.M.100.T.D",
438 | "D30036.M.100.T.F",
439 | "D30036.M.100.T.H"
440 | ]
441 | },
442 | "D30037": {
443 | "inputs": [
444 | "APW19981004.0138",
445 | "NYT19981009.0337",
446 | "NYT19981009.0369",
447 | "NYT19981009.0486",
448 | "APW19981009.1040",
449 | "APW19981010.0164",
450 | "APW19981010.0173",
451 | "APW19981010.0374",
452 | "APW19981010.0383",
453 | "APW19981012.0254"
454 | ],
455 | "models": [
456 | "D30037.M.100.T.A",
457 | "D30037.M.100.T.D",
458 | "D30037.M.100.T.G",
459 | "D30037.M.100.T.H"
460 | ]
461 | },
462 | "D30038": {
463 | "inputs": [
464 | "APW19981211.0352",
465 | "APW19981211.1276",
466 | "APW19981211.1288",
467 | "APW19981212.0354",
468 | "APW19981212.0562",
469 | "NYT19981213.0205",
470 | "APW19981213.0396",
471 | "APW19981213.0412",
472 | "APW19981213.0424",
473 | "APW19981213.0720"
474 | ],
475 | "models": [
476 | "D30038.M.100.T.A",
477 | "D30038.M.100.T.E",
478 | "D30038.M.100.T.F",
479 | "D30038.M.100.T.H"
480 | ]
481 | },
482 | "D30040": {
483 | "inputs": [
484 | "APW19981119.1180",
485 | "APW19981119.1227",
486 | "APW19981123.1179",
487 | "APW19981124.0254",
488 | "APW19981124.0256",
489 | "APW19981205.0220",
490 | "APW19981229.0756",
491 | "APW19981229.0763",
492 | "APW19981230.0983",
493 | "APW19981230.0991"
494 | ],
495 | "models": [
496 | "D30040.M.100.T.A",
497 | "D30040.M.100.T.E",
498 | "D30040.M.100.T.G",
499 | "D30040.M.100.T.H"
500 | ]
501 | },
502 | "D30042": {
503 | "inputs": [
504 | "APW19981019.0307",
505 | "APW19981129.0668",
506 | "APW19981202.1230",
507 | "APW19981205.0172",
508 | "APW19981205.0213",
509 | "APW19981206.0364",
510 | "APW19981206.0371",
511 | "APW19981209.1444",
512 | "APW19981212.0848",
513 | "APW19981221.1004"
514 | ],
515 | "models": [
516 | "D30042.M.100.T.B",
517 | "D30042.M.100.T.C",
518 | "D30042.M.100.T.D",
519 | "D30042.M.100.T.F"
520 | ]
521 | },
522 | "D30044": {
523 | "inputs": [
524 | "APW19981114.0575",
525 | "APW19981115.0371",
526 | "APW19981115.0618",
527 | "APW19981115.0626",
528 | "APW19981116.0221",
529 | "APW19981116.0235",
530 | "NYT19981116.0479",
531 | "APW19981116.1120",
532 | "APW19981117.0528",
533 | "APW19981117.0530"
534 | ],
535 | "models": [
536 | "D30044.M.100.T.B",
537 | "D30044.M.100.T.C",
538 | "D30044.M.100.T.D",
539 | "D30044.M.100.T.G"
540 | ]
541 | },
542 | "D30045": {
543 | "inputs": [
544 | "NYT19981125.0417",
545 | "NYT19981125.0433",
546 | "NYT19981126.0192",
547 | "NYT19981127.0203",
548 | "NYT19981127.0240",
549 | "NYT19981127.0256",
550 | "NYT19981127.0264",
551 | "NYT19981127.0289",
552 | "NYT19981127.0293",
553 | "NYT19981129.0113"
554 | ],
555 | "models": [
556 | "D30045.M.100.T.B",
557 | "D30045.M.100.T.C",
558 | "D30045.M.100.T.E",
559 | "D30045.M.100.T.F"
560 | ]
561 | },
562 | "D30046": {
563 | "inputs": [
564 | "NYT19981217.0394",
565 | "NYT19981218.0380",
566 | "NYT19981219.0101",
567 | "NYT19981219.0102",
568 | "NYT19981219.0104",
569 | "NYT19981219.0106",
570 | "NYT19981219.0117",
571 | "NYT19981219.0145",
572 | "NYT19981219.0148",
573 | "NYT19981219.0170"
574 | ],
575 | "models": [
576 | "D30046.M.100.T.B",
577 | "D30046.M.100.T.C",
578 | "D30046.M.100.T.E",
579 | "D30046.M.100.T.H"
580 | ]
581 | },
582 | "D30047": {
583 | "inputs": [
584 | "NYT19981113.0404",
585 | "APW19981119.0252",
586 | "APW19981120.0290",
587 | "NYT19981120.0427",
588 | "APW19981120.0892",
589 | "APW19981121.0727",
590 | "APW19981207.0577",
591 | "APW19981207.0578",
592 | "APW19981207.0580",
593 | "APW19981209.1470"
594 | ],
595 | "models": [
596 | "D30047.M.100.T.B",
597 | "D30047.M.100.T.C",
598 | "D30047.M.100.T.F",
599 | "D30047.M.100.T.H"
600 | ]
601 | },
602 | "D30048": {
603 | "inputs": [
604 | "APW19981016.0655",
605 | "APW19981017.0692",
606 | "APW19981019.0550",
607 | "NYT19981020.0380",
608 | "NYT19981020.0382",
609 | "APW19981020.1106",
610 | "APW19981021.1160",
611 | "APW19981021.1170",
612 | "APW19981022.1123",
613 | "NYT19981024.0050"
614 | ],
615 | "models": [
616 | "D30048.M.100.T.B",
617 | "D30048.M.100.T.C",
618 | "D30048.M.100.T.G",
619 | "D30048.M.100.T.H"
620 | ]
621 | },
622 | "D30049": {
623 | "inputs": [
624 | "APW19981023.0254",
625 | "NYT19981028.0441",
626 | "NYT19981114.0099",
627 | "NYT19981118.0464",
628 | "NYT19981121.0041",
629 | "NYT19981121.0117",
630 | "APW19981121.0131",
631 | "APW19981121.0344",
632 | "NYT19981122.0111",
633 | "APW19981203.0321"
634 | ],
635 | "models": [
636 | "D30049.M.100.T.B",
637 | "D30049.M.100.T.D",
638 | "D30049.M.100.T.E",
639 | "D30049.M.100.T.G"
640 | ]
641 | },
642 | "D30050": {
643 | "inputs": [
644 | "NYT19981002.0309",
645 | "NYT19981002.0366",
646 | "NYT19981005.0479",
647 | "NYT19981006.0394",
648 | "NYT19981008.0387",
649 | "NYT19981008.0453",
650 | "NYT19981009.0434",
651 | "NYT19981010.0027",
652 | "NYT19981010.0149",
653 | "NYT19981010.0151"
654 | ],
655 | "models": [
656 | "D30050.M.100.T.B",
657 | "D30050.M.100.T.D",
658 | "D30050.M.100.T.E",
659 | "D30050.M.100.T.H"
660 | ]
661 | },
662 | "D30051": {
663 | "inputs": [
664 | "APW19981108.0803",
665 | "APW19981111.0631",
666 | "NYT19981114.0079",
667 | "APW19981116.0213",
668 | "APW19981116.0231",
669 | "APW19981116.0525",
670 | "APW19981121.0514",
671 | "APW19981129.0969",
672 | "APW19981130.0803",
673 | "NYT19981202.0391"
674 | ],
675 | "models": [
676 | "D30051.M.100.T.B",
677 | "D30051.M.100.T.D",
678 | "D30051.M.100.T.F",
679 | "D30051.M.100.T.H"
680 | ]
681 | },
682 | "D30053": {
683 | "inputs": [
684 | "NYT19981127.0267",
685 | "APW19981206.0557",
686 | "APW19981206.0559",
687 | "APW19981207.1390",
688 | "APW19981208.0312",
689 | "APW19981210.0305",
690 | "APW19981211.0628",
691 | "APW19981212.0161",
692 | "APW19981213.0224",
693 | "APW19981216.0275"
694 | ],
695 | "models": [
696 | "D30053.M.100.T.B",
697 | "D30053.M.100.T.E",
698 | "D30053.M.100.T.F",
699 | "D30053.M.100.T.G"
700 | ]
701 | },
702 | "D30055": {
703 | "inputs": [
704 | "NYT19981004.0064",
705 | "APW19981004.0320",
706 | "NYT19981005.0306",
707 | "APW19981005.0721",
708 | "APW19981006.0270",
709 | "NYT19981009.0371",
710 | "APW19981009.0494",
711 | "APW19981009.0501",
712 | "APW19981009.0525",
713 | "APW19981012.0281"
714 | ],
715 | "models": [
716 | "D30055.M.100.T.B",
717 | "D30055.M.100.T.E",
718 | "D30055.M.100.T.F",
719 | "D30055.M.100.T.H"
720 | ]
721 | },
722 | "D30056": {
723 | "inputs": [
724 | "APW19981016.0448",
725 | "APW19981208.0906",
726 | "APW19981209.0688",
727 | "APW19981209.1423",
728 | "APW19981209.1425",
729 | "APW19981210.0940",
730 | "APW19981211.1223",
731 | "APW19981212.0189",
732 | "APW19981212.0191",
733 | "APW19981212.0597"
734 | ],
735 | "models": [
736 | "D30056.M.100.T.B",
737 | "D30056.M.100.T.E",
738 | "D30056.M.100.T.G",
739 | "D30056.M.100.T.H"
740 | ]
741 | },
742 | "D30059": {
743 | "inputs": [
744 | "APW19981120.1199",
745 | "APW19981120.1224",
746 | "APW19981120.1237",
747 | "APW19981120.1239",
748 | "APW19981121.0482",
749 | "NYT19981122.0110",
750 | "NYT19981122.0194",
751 | "APW19981122.0610",
752 | "APW19981123.0274",
753 | "APW19981124.0233"
754 | ],
755 | "models": [
756 | "D30059.M.100.T.B",
757 | "D30059.M.100.T.F",
758 | "D30059.M.100.T.G",
759 | "D30059.M.100.T.H"
760 | ]
761 | },
762 | "D31001": {
763 | "inputs": [
764 | "APW19981008.0841",
765 | "APW19981026.0485",
766 | "APW19981026.0787",
767 | "APW19981028.0231",
768 | "NYT19981028.0331",
769 | "NYT19981029.0366",
770 | "NYT19981031.0150",
771 | "APW19981031.0742",
772 | "NYT19981107.0056",
773 | "APW19981109.0767"
774 | ],
775 | "models": [
776 | "D31001.M.100.T.C",
777 | "D31001.M.100.T.D",
778 | "D31001.M.100.T.E",
779 | "D31001.M.100.T.G"
780 | ]
781 | },
782 | "D31008": {
783 | "inputs": [
784 | "NYT19981009.0436",
785 | "NYT19981009.0470",
786 | "NYT19981011.0203",
787 | "NYT19981012.0357",
788 | "NYT19981012.0359",
789 | "NYT19981013.0277",
790 | "NYT19981013.0349",
791 | "NYT19981016.0233",
792 | "NYT19981016.0257",
793 | "NYT19981016.0342"
794 | ],
795 | "models": [
796 | "D31008.M.100.T.C",
797 | "D31008.M.100.T.D",
798 | "D31008.M.100.T.E",
799 | "D31008.M.100.T.H"
800 | ]
801 | },
802 | "D31009": {
803 | "inputs": [
804 | "APW19981111.0309",
805 | "APW19981111.1240",
806 | "APW19981119.0529",
807 | "NYT19981202.0315",
808 | "APW19981202.0880",
809 | "APW19981203.0322",
810 | "APW19981209.0696",
811 | "APW19981219.0504",
812 | "APW19981221.1044",
813 | "APW19981228.0740"
814 | ],
815 | "models": [
816 | "D31009.M.100.T.B",
817 | "D31009.M.100.T.C",
818 | "D31009.M.100.T.F",
819 | "D31009.M.100.T.G"
820 | ]
821 | },
822 | "D31013": {
823 | "inputs": [
824 | "NYT19981102.0465",
825 | "NYT19981104.0619",
826 | "NYT19981104.0623",
827 | "NYT19981105.0521",
828 | "NYT19981106.0565",
829 | "APW19981109.0728",
830 | "NYT19981110.0442",
831 | "NYT19981112.0195",
832 | "NYT19981114.0057",
833 | "NYT19981114.0129"
834 | ],
835 | "models": [
836 | "D31013.M.100.T.C",
837 | "D31013.M.100.T.D",
838 | "D31013.M.100.T.G",
839 | "D31013.M.100.T.H"
840 | ]
841 | },
842 | "D31022": {
843 | "inputs": [
844 | "APW19981030.0230",
845 | "APW19981030.0470",
846 | "APW19981030.0489",
847 | "APW19981030.0792",
848 | "APW19981030.1037",
849 | "APW19981030.1041",
850 | "APW19981030.1046",
851 | "APW19981030.1066",
852 | "APW19981031.0314",
853 | "APW19981031.0551"
854 | ],
855 | "models": [
856 | "D31022.M.100.T.C",
857 | "D31022.M.100.T.E",
858 | "D31022.M.100.T.F",
859 | "D31022.M.100.T.G"
860 | ]
861 | },
862 | "D31026": {
863 | "inputs": [
864 | "NYT19981010.0022",
865 | "NYT19981014.0038",
866 | "NYT19981016.0291",
867 | "NYT19981017.0014",
868 | "NYT19981017.0027",
869 | "NYT19981017.0047",
870 | "NYT19981017.0052",
871 | "NYT19981017.0093",
872 | "NYT19981017.0132",
873 | "NYT19981018.0014"
874 | ],
875 | "models": [
876 | "D31026.M.100.T.C",
877 | "D31026.M.100.T.E",
878 | "D31026.M.100.T.F",
879 | "D31026.M.100.T.H"
880 | ]
881 | },
882 | "D31031": {
883 | "inputs": [
884 | "NYT19981001.0442",
885 | "NYT19981001.0499",
886 | "NYT19981011.0194",
887 | "NYT19981013.0339",
888 | "NYT19981013.0341",
889 | "NYT19981013.0399",
890 | "NYT19981013.0427",
891 | "NYT19981016.0245",
892 | "NYT19981016.0293",
893 | "NYT19981016.0312"
894 | ],
895 | "models": [
896 | "D31031.M.100.T.C",
897 | "D31031.M.100.T.F",
898 | "D31031.M.100.T.G",
899 | "D31031.M.100.T.H"
900 | ]
901 | },
902 | "D31032": {
903 | "inputs": [
904 | "APW19981001.0299",
905 | "APW19981007.0563",
906 | "APW19981011.0535",
907 | "APW19981012.0522",
908 | "APW19981012.1126",
909 | "APW19981013.0282",
910 | "APW19981014.0819",
911 | "APW19981015.0170",
912 | "APW19981015.0569",
913 | "APW19981016.0209"
914 | ],
915 | "models": [
916 | "D31032.M.100.T.D",
917 | "D31032.M.100.T.E",
918 | "D31032.M.100.T.F",
919 | "D31032.M.100.T.G"
920 | ]
921 | },
922 | "D31033": {
923 | "inputs": [
924 | "NYT19981007.0302",
925 | "NYT19981009.0282",
926 | "NYT19981012.0400",
927 | "NYT19981018.0065",
928 | "NYT19981018.0079",
929 | "NYT19981018.0084",
930 | "NYT19981018.0089",
931 | "NYT19981018.0091",
932 | "NYT19981018.0093",
933 | "NYT19981024.0043"
934 | ],
935 | "models": [
936 | "D31033.M.100.T.D",
937 | "D31033.M.100.T.E",
938 | "D31033.M.100.T.F",
939 | "D31033.M.100.T.H"
940 | ]
941 | },
942 | "D31038": {
943 | "inputs": [
944 | "APW19981023.0569",
945 | "NYT19981025.0187",
946 | "NYT19981026.0361",
947 | "NYT19981104.0491",
948 | "APW19981104.0772",
949 | "APW19981105.0282",
950 | "NYT19981110.0432",
951 | "APW19981111.0288",
952 | "APW19981112.0551",
953 | "APW19981120.0887"
954 | ],
955 | "models": [
956 | "D31038.M.100.T.D",
957 | "D31038.M.100.T.E",
958 | "D31038.M.100.T.G",
959 | "D31038.M.100.T.H"
960 | ]
961 | },
962 | "D31043": {
963 | "inputs": [
964 | "APW19981006.0802",
965 | "APW19981007.0567",
966 | "APW19981013.0301",
967 | "APW19981015.0163",
968 | "APW19981015.0167",
969 | "APW19981128.0178",
970 | "APW19981129.0871",
971 | "APW19981129.0896",
972 | "APW19981130.1079",
973 | "APW19981130.1085"
974 | ],
975 | "models": [
976 | "D31043.M.100.T.D",
977 | "D31043.M.100.T.F",
978 | "D31043.M.100.T.G",
979 | "D31043.M.100.T.H"
980 | ]
981 | },
982 | "D31050": {
983 | "inputs": [
984 | "APW19981005.0231",
985 | "APW19981220.0356",
986 | "APW19981220.0578",
987 | "APW19981221.0236",
988 | "NYT19981221.0377",
989 | "APW19981221.0719",
990 | "APW19981221.0757",
991 | "NYT19981222.0021",
992 | "APW19981223.0717",
993 | "APW19981224.0149"
994 | ],
995 | "models": [
996 | "D31050.M.100.T.E",
997 | "D31050.M.100.T.F",
998 | "D31050.M.100.T.G",
999 | "D31050.M.100.T.H"
1000 | ]
1001 | }
1002 | }
1003 |
1004 |
--------------------------------------------------------------------------------