├── sumpy ├── data │ ├── smart_common_words.txt.gz │ ├── mead_example_docs │ │ ├── 41.docsent │ │ ├── 87.docsent │ │ └── 81.docsent │ ├── duc07_task2.json │ ├── duc03_task2.json │ └── duc04_task2.json ├── simple.py ├── system │ ├── __init__.py │ ├── _graph.py │ ├── _submodular.py │ ├── _baseline.py │ └── _base.py ├── annotators │ ├── __init__.py │ ├── _annotator_base.py │ ├── _preprocessor.py │ ├── _submodular.py │ └── _feature_extractors.py ├── document.py ├── io.py ├── eval.py ├── preprocessor.py ├── __init__.py └── util.py ├── NOTICE ├── .gitignore ├── setup.py ├── README.md ├── duc_testbed.py └── LICENSE /sumpy/data/smart_common_words.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kedz/sumpy/HEAD/sumpy/data/smart_common_words.txt.gz -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Sumpy: a multidocument summarization library for python. 2 | Copyright 2015 Chris Kedzie 3 | 4 | This product includes software developed at 5 | Columbia University. 6 | -------------------------------------------------------------------------------- /sumpy/simple.py: -------------------------------------------------------------------------------- 1 | import sumpy 2 | 3 | def lede(inputs): 4 | s = sumpy.system.LedeSummarizer() 5 | return s.summarize(inputs) 6 | 7 | def centroid(inputs): 8 | s = sumpy.system.CentroidSummarizer() 9 | return s.summarize(inputs) 10 | 11 | def mmr(inputs): 12 | s = sumpy.system.MMRSummarizer() 13 | return s.summarize(inputs) 14 | 15 | def textrank(inputs): 16 | s = sumpy.system.TextRankSummarizer() 17 | return s.summarize(inputs) 18 | 19 | def lexrank(inputs): 20 | s = sumpy.system.LexRankSummarizer() 21 | return s.summarize(inputs) 22 | -------------------------------------------------------------------------------- /sumpy/system/__init__.py: -------------------------------------------------------------------------------- 1 | from sumpy.system._base import AverageFeatureRankerBase 2 | from sumpy.system._baseline import (LedeSummarizer, CentroidSummarizer, 3 | MMRSummarizer) 4 | from sumpy.system._graph import TextRankSummarizer, LexRankSummarizer 5 | from sumpy.system._submodular import MonotoneSubmodularBasic, SubmodularMMRSummarizer 6 | 7 | __all__ = ["LedeSummarizer", "CentroidSummarizer", "MMRSummarizer", 8 | "TextRankSummarizer", "LexRankSummarizer", 9 | "MonotoneSubmodularBasic", "SubmodularMMRSummarizer, AverageFeatureRankerBase"] 10 | -------------------------------------------------------------------------------- /sumpy/annotators/__init__.py: -------------------------------------------------------------------------------- 1 | from sumpy.annotators._preprocessor import (SentenceTokenizerMixin, 2 | WordTokenizerMixin, RawBOWMixin, BinaryBOWMixin, TfIdfMixin, 3 | TfIdfCosineSimilarityMixin) 4 | from sumpy.annotators._feature_extractors import (LedeMixin, TextRankMixin, 5 | LexRankMixin, CentroidMixin, MMRMixin) 6 | from sumpy.annotators._submodular import MonotoneSubmodularMixin, SubmodularMMRMixin 7 | 8 | 9 | __all__ = ['SentenceTokenizerMixin', 'WordTokenizerMixin', 'RawBOWMixin', 10 | 'BinaryBOWMixin', 'TfIdfMixin', 'TfIdfCosineSimilarityMixin', 11 | 'LedeMixin', 'TextRankMixin', 'LexRankMixin', 'CentroidMixin', 12 | 'MMRMixin', 'MonotoneSubmodularMixin'] 13 | -------------------------------------------------------------------------------- 
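All of the system classes exported above share the summarize() entry point defined in sumpy/system/_base.py, so the one-line helpers in sumpy/simple.py and the classes themselves are interchangeable. Below is a minimal sketch of driving a system class directly with explicit hyperparameters, using the bundled MEAD demo documents; it assumes the NLTK punkt model is available (the pipeline fetches it on first use).

import sumpy
from sumpy.system import LexRankSummarizer

docs = sumpy.io.load_demo_docs()

# Equivalent to sumpy.lexrank(docs), but with the hyperparameters spelled out.
summarizer = LexRankSummarizer(d=.85, tol=.0001, max_iters=20, verbose=True)
summary = summarizer.summarize(docs)

# Summary objects render through a length budget (bytes by default).
print summary.budget(type="byte", size=250)

The helper sumpy.lexrank(docs) from simple.py produces the same ranking with the default settings.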
/sumpy/annotators/_annotator_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | class _AnnotatorBase(object): 4 | __metaclass__ = ABCMeta 5 | 6 | @abstractmethod 7 | def requires(self): 8 | pass 9 | 10 | @abstractmethod 11 | def ndarray_requires(self): 12 | pass 13 | 14 | @abstractmethod 15 | def returns(self): 16 | pass 17 | 18 | @abstractmethod 19 | def ndarray_returns(self): 20 | pass 21 | 22 | @abstractmethod 23 | def name(self): 24 | pass 25 | 26 | @abstractmethod 27 | def build(self): 28 | pass 29 | 30 | @abstractmethod 31 | def process(self, input_df, ndarray_data): 32 | pass 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import os 3 | import sys 4 | 5 | data_dir = os.path.join(sys.prefix, "data") 6 | setup( 7 | name = 'sumpy', 8 | packages = ['sumpy', 'sumpy.system', 'sumpy.annotators'], 9 | version = '0.0.1', 10 | description = 'SUMPY: an automatic text summarization library', 11 | author='Chris Kedzie', 12 | author_email='kedzie@cs.columbia.edu', 13 | url='https://github.com/kedz/sumpy', 14 | install_requires=[ 15 | 'nltk', 'numpy', 'scipy', 'scikit-learn', 'pandas', 16 | 'networkx', 17 | ], 18 | include_package_data=True, 19 | package_data={ 20 | 'sumpy': [os.path.join(data_dir, 'smart_common_words.txt.gz'), 21 | os.path.join(data_dir, 'mead_example_docs', '41.docsent'), 22 | os.path.join(data_dir, 'mead_example_docs', '81.docsent'), 23 | os.path.join(data_dir, 'mead_example_docs', '87.docsent'), 24 | ]}, 25 | 26 | ) 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sumpy 2 | SUMPY: a python automatic text summarization library 3 | 4 | We currently have several baseline summarizers implemented: 5 | 6 | [x] lede 7 | 8 | [x] TextRank 9 | 10 | [x] LexRank 11 | 12 | [x] Centroid 13 | 14 | [x] ROUGE ngram evaluation 15 | 16 | and have plans to implement many more. 
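For example, the ROUGE n-gram scorer can be called programmatically against your own reference summaries. A rough sketch (the document and reference strings are placeholders, and the NLTK punkt sentence model must be installed):

    import sumpy
    from sumpy.eval import ROUGE

    inputs = ["This is the text for document1...",
              "This is another document text...",
              "And yet another document..."]
    references = ["A human written reference summary...",
                  "Another human written reference summary..."]

    systems = [("lede", unicode(sumpy.lede(inputs))),
               ("lexrank", unicode(sumpy.lexrank(inputs)))]

    rouge = ROUGE(max_ngrams=2, limit=100, limit_type=u"word")
    print rouge.evaluate(systems, references)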
17 | 18 | TODO: 19 | 20 | [ ] ROUGE skip-gram and LCS evaluation 21 | 22 | [ ] FreqSum/SumBasic 23 | 24 | [ ] Submodular optimization based summarizers 25 | 26 | [ ] lda/distributes sentence representation based summarizer 27 | 28 | [ ] DEMS 29 | 30 | [ ] ILP based summarizers 31 | 32 | [ ] collect topic signatures/important word lists 33 | 34 | SUMPY contains several ready to use summarizers with 35 | sensible defaults. Here is a simple example to get you started: 36 | 37 | import sumpy 38 | 39 | doc1 = "This is the text for document1. It is for explanatory purposes..." 40 | doc2 = "This is another document text..." 41 | doc3 = "And yet another document..." 42 | 43 | inputs = [doc1, doc2, doc3] 44 | 45 | print "lede summarizer:" 46 | print sumpy.lede(inputs) 47 | 48 | print "\ntextrank summarizer:" 49 | print sumpy.textrank(inputs) 50 | 51 | print "\ncentroid summarizer:" 52 | print sumpy.centroid(inputs) 53 | -------------------------------------------------------------------------------- /sumpy/system/_graph.py: -------------------------------------------------------------------------------- 1 | from sumpy.system._base import _SystemBase 2 | from sumpy.annotators import TextRankMixin, LexRankMixin 3 | from sumpy.document import Summary 4 | 5 | class TextRankSummarizer(TextRankMixin, _SystemBase): 6 | 7 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 8 | directed=u"undirected", d=.85, tol=.0001, max_iters=20, 9 | verbose=False): 10 | self._sentence_tokenizer = sentence_tokenizer 11 | self._word_tokenizer = word_tokenizer 12 | self.directed = directed 13 | self.d = d 14 | self.tol = tol 15 | self.max_iters = max_iters 16 | super(TextRankSummarizer, self).__init__(verbose=verbose) 17 | 18 | def build_summary(self, input_df, ndarray_data): 19 | output_df = input_df.sort(["f:textrank"], ascending=False) 20 | return Summary(output_df) 21 | 22 | class LexRankSummarizer(LexRankMixin, _SystemBase): 23 | 24 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 25 | d=.85, tol=.0001, max_iters=20, 26 | verbose=False): 27 | self._sentence_tokenizer = sentence_tokenizer 28 | self._word_tokenizer = word_tokenizer 29 | self.d = d 30 | self.tol = tol 31 | self.max_iters = max_iters 32 | super(LexRankSummarizer, self).__init__(verbose=verbose) 33 | 34 | def build_summary(self, input_df, ndarray_data): 35 | output_df = input_df.sort(["f:lexrank"], ascending=False) 36 | return Summary(output_df) 37 | 38 | -------------------------------------------------------------------------------- /sumpy/system/_submodular.py: -------------------------------------------------------------------------------- 1 | from sumpy.system._base import _SystemBase 2 | from sumpy.annotators import MonotoneSubmodularMixin, SubmodularMMRMixin 3 | from sumpy.document import Summary 4 | 5 | class SubmodularMMRSummarizer(SubmodularMMRMixin, _SystemBase): 6 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 7 | lam=.3, budget_type="word", budget_size=400, scale=.2, 8 | verbose=False): 9 | self.sentence_tokenizer = sentence_tokenizer 10 | self.word_tokenizer = word_tokenizer 11 | self.lam = lam 12 | self.scale = scale 13 | self.budget_type = budget_type 14 | self.budget_size = budget_size 15 | 16 | super(SubmodularMMRSummarizer, self).__init__(verbose=verbose) 17 | 18 | def build_summary(self, input_df, ndarray_data): 19 | output_df = input_df[input_df["f:submodular-mmr"].isnull() == False] 20 | output_df = output_df.sort(["doc id", "sent id"], ascending=True) 21 | print output_df 22 | 
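        # (Debug output left in by the author: the print above dumps the selected
        # rows; the one below prints each selected sentence's character length.)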
print output_df['sent text'].apply(len) 23 | return Summary(output_df) 24 | 25 | 26 | class MonotoneSubmodularBasic(MonotoneSubmodularMixin, _SystemBase): 27 | 28 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 29 | k=5, f_of_A=None, verbose=False): 30 | 31 | self.sentence_tokenizer = sentence_tokenizer 32 | self.word_tokenizer = word_tokenizer 33 | self.k = k 34 | self.f_of_A = f_of_A 35 | super(MonotoneSubmodularBasic, self).__init__(verbose=verbose) 36 | 37 | def build_summary(self, input_df, ndarray_data): 38 | output_df = input_df[input_df["f:monotone-submod"] == 1] 39 | output_df = output_df.sort(["doc id", "sent id"], ascending=True) 40 | return Summary(output_df) 41 | 42 | -------------------------------------------------------------------------------- /sumpy/system/_baseline.py: -------------------------------------------------------------------------------- 1 | from sumpy.system._base import _SystemBase 2 | from sumpy.annotators import (WordTokenizerMixin, LedeMixin, MMRMixin, 3 | CentroidMixin) 4 | from sumpy.document import Summary 5 | 6 | class LedeSummarizer(WordTokenizerMixin, LedeMixin, _SystemBase): 7 | 8 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 9 | verbose=False): 10 | self._sentence_tokenizer = sentence_tokenizer 11 | self._word_tokenizer = word_tokenizer 12 | super(LedeSummarizer, self).__init__(verbose=verbose) 13 | 14 | def build_summary(self, input_df, ndarray_data): 15 | output_df = input_df[input_df[u"f:lede"] == 1].sort( 16 | ["doc id"], ascending=True) 17 | return Summary(output_df) 18 | 19 | class CentroidSummarizer(CentroidMixin, _SystemBase): 20 | 21 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 22 | verbose=False): 23 | self._sentence_tokenizer = sentence_tokenizer 24 | self._word_tokenizer = word_tokenizer 25 | super(CentroidSummarizer, self).__init__(verbose=verbose) 26 | 27 | def build_summary(self, input_df, ndarray_data): 28 | output_df = input_df.sort(["f:centroid"], ascending=False) 29 | return Summary(output_df) 30 | 31 | class MMRSummarizer(MMRMixin, _SystemBase): 32 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 33 | lam=.4, verbose=False): 34 | self._sentence_tokenizer = sentence_tokenizer 35 | self._word_tokenizer = word_tokenizer 36 | self.lam = lam 37 | super(MMRSummarizer, self).__init__(verbose=verbose) 38 | 39 | def build_summary(self, input_df, ndarray_data): 40 | output_df = input_df.sort(["f:mmr"], ascending=False) 41 | return Summary(output_df) 42 | -------------------------------------------------------------------------------- /sumpy/document.py: -------------------------------------------------------------------------------- 1 | import re 2 | import textwrap 3 | 4 | class Summary(object): 5 | def __init__(self, df): 6 | self._df = df 7 | 8 | def budget(self, type="byte", size=600): 9 | summary = [] 10 | if size == "all": 11 | summary = self._df["sent text"].tolist() 12 | elif type == "word": 13 | remaining = size 14 | for idx, sent in self._df.iterrows(): 15 | num_words = min(len(sent["words"]), remaining) 16 | summary.append(u" ".join(sent["words"][0 : num_words])) 17 | remaining -= num_words 18 | if remaining < 1: 19 | break 20 | elif type == "byte": 21 | remaining = size 22 | for idx, sent in self._df.iterrows(): 23 | num_chars = min(len(sent["sent text"]), remaining) 24 | print num_chars 25 | summary.append(sent["sent text"][0 : num_chars]) 26 | remaining -= num_chars 27 | if remaining < 1: 28 | break 29 | return u"\n".join(textwrap.fill(u"{}) 
{}".format(i, sent)) 30 | for i, sent in enumerate(summary, 1)) + u" ..." 31 | 32 | def __unicode__(self): 33 | return self.budget() 34 | 35 | def __str__(self): 36 | return unicode(self).encode("utf-8") 37 | 38 | 39 | class Document(object): 40 | def __init__(self, name, text): 41 | self.name = name 42 | if isinstance(self.name, str): 43 | self.name = self.name.decode(u"utf-8") 44 | self.text = text 45 | if isinstance(self.text, str): 46 | self.text = self.text.decode(u"utf-8") 47 | 48 | def __str__(self): 49 | return unicode(self).encode(u"utf-8") 50 | 51 | def __unicode__(self): 52 | return self.name + u"\n" + self.text 53 | 54 | class DocSet(object): 55 | def __init__(self, docs): 56 | self.docs = docs 57 | -------------------------------------------------------------------------------- /duc_testbed.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import os 4 | import sumpy 5 | import sumpy.eval 6 | 7 | def load_docsets(duc_dir): 8 | 9 | docset_paths = [os.path.join(duc_dir, fname) 10 | for fname in os.listdir(duc_dir)] 11 | docset_paths = [path for path in docset_paths if os.path.isdir(path)] 12 | docsets = {} 13 | for docset_path in docset_paths: 14 | docset_id, docs, models = load_docset(docset_path) 15 | docsets[docset_id] = {u"docs": docs, u"models": models} 16 | return docsets 17 | 18 | def load_docset(docset_path): 19 | docset_id = os.path.split(docset_path)[1] 20 | docs_path = os.path.join(docset_path, u"docs") 21 | docs = sumpy.io.load_duc_docset(docs_path) 22 | models = [] 23 | for fname in os.listdir(docset_path): 24 | if docset_id in fname: 25 | model_paths = [os.path.join(docset_path, fname, length) 26 | for length in [u"200", u"400"]] 27 | model_sums = sumpy.io.load_duc_abstractive_summaries(model_paths) 28 | models.extend(model_sums) 29 | return docset_id, docs, models 30 | 31 | 32 | def generate_summaries(systems, docsets): 33 | rouge = sumpy.eval.ROUGE(max_ngrams=2, limit=100, limit_type=u"word") 34 | results = [] 35 | for docset_id in docsets.keys(): 36 | #print docset_id 37 | docs = docsets[docset_id][u"docs"] 38 | models = docsets[docset_id][u"models"] 39 | sys_sums = [(system_name, unicode(sum_func(docs))) 40 | for system_name, sum_func in systems] 41 | df = rouge.evaluate(sys_sums, models) 42 | results.append(df) 43 | return pd.concat(results).groupby(level=0).mean() 44 | 45 | def main(duc_dir): 46 | print u"Loading DUC document sets from:", duc_dir 47 | docsets = load_docsets(duc_dir) 48 | 49 | lede = lambda x: sumpy.lede(x) 50 | centroid = lambda x: sumpy.centroid(x) 51 | lexrank = lambda x: sumpy.lexrank(x) 52 | systems = [(u"lede", lede), (u"centroid", centroid), 53 | (u"lexrank", lexrank)] 54 | print generate_summaries(systems, docsets) 55 | 56 | if __name__ == u"__main__": 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument(u"-d", u"--duc-dir", required=True, type=unicode, 59 | help=u"path to DUC document set directory") 60 | args = parser.parse_args() 61 | duc_dir = args.duc_dir 62 | main(duc_dir) 63 | 64 | -------------------------------------------------------------------------------- /sumpy/data/mead_example_docs/41.docsent: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Egyptians Suffer Second Air Tragedy in a Year 6 | 7 | CAIRO, Egypt -- The crash of a Gulf Air flight that killed 143 people in Bahrain is a disturbing deja vu for Egyptians: It is the second plane crash within a year to devastate this Arab 
country. 8 | Sixty-three Egyptians were on board the Airbus A320, which crashed into shallow Persian Gulf waters Wednesday night after circling and trying to land in Bahrain. 9 | On Oct. 31, 1999, a plane carrying 217 mostly Egyptian passengers crashed into the Atlantic Ocean off Massachusetts. 10 | The cause has not been determined, providing no closure to the families, whose grief was reopened this month with the release of a factual report by the National Transportation Safety Board. 11 | Walid Mourad, head of the Egyptian Pilots Association and a voice often heard in relation to the EgyptAir investigation, said Wednesday's crash is a tragedy for the Arab people as a whole. 12 | "We are all family and brothers. 13 | We all have something in this," Mourad said. 14 | "But for the Egyptians, this is a double blow. 15 | Two disasters in a row for the Egyptians." 16 | Many of the passengers on the Gulf Air flight were headed for jobs in Bahrain or elsewhere in the Gulf. 17 | Rida Hassan was one of those escaping Egypt's moribund economy for work in the oil-rich Gulf. 18 | Hassan's uncle said he rushed to the Cairo airport after hearing a list of the passengers read on television. 19 | The uncle, who would not give his name, said his nephew had come home to get married and stayed only a month. 20 | Hassan worked in a restaurant in Bahrain, his uncle said before disappearing into a room at the airport set aside for relatives desperate for news. 21 | In the hours just after the crash, relatives at the Cairo airport expressed anger and frustration at Gulf Air for the slow release of information. 22 | Women screamed and men tried vainly to calm them. 23 | "No information is being given to us. 24 | Absolutely nothing," Mohammed Ibrahim el-Naggar said hours after the crash. 25 | "We were told that there were some survivors but no names were given." 26 | El-Naggar said his cousin, her husband who works in Dubai, and their two children aged 2 and 3 were on the downed plane. 27 | Gulf Air said it was sending a special plane to carry 134 relatives to Manama airport later Thursday. 28 | "All necessary measures have been taken to receive the families of the victims," Mohammed al-Sayed Abbas, the Egyptian ambassador to Bahrain, told Egyptian television. 29 | "The embassy staff will be with them step by step until they identify the deceased." 30 | In Bahrain, relatives were beginning the wrenching process of identifying the victims from photographs taken after the bodies were retrieved from the Gulf. 31 | Egypt, which lacks the oil wealth of the Gulf and has an economy struggling to revive from decades of socialist stagnation, has a long tradition of sending workers to the Gulf to fill everything from skilled to menial jobs. 32 | Remittances from citizens working abroad make up Egypt's biggest source of foreign exchange. 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /sumpy/io.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import pandas as pd 4 | 5 | def load_duc_docset(input_source): 6 | docs = DucSgmlReader().read(input_source) 7 | return docs 8 | 9 | def load_duc_abstractive_summaries(input_source): 10 | models = DucAbstractSgmlReader().read(input_source) 11 | return models 12 | 13 | class FileInput(object): 14 | 15 | def gather_paths(self, source): 16 | """Determines the type of source and return an iterator over input 17 | document paths. 
If source is a str or unicode 18 | object, determine if it is also a directory and return an iterator 19 | for all directory files; otherwise treat as a single document input. 20 | If source is any other iterable, treat as an iterable of file 21 | paths.""" 22 | 23 | if isinstance(source, str) or isinstance(source, unicode): 24 | if os.path.isdir(source): 25 | paths = [os.path.join(source, fname) 26 | for fname in os.listdir(source)] 27 | for path in paths: 28 | yield path 29 | else: 30 | yield source 31 | 32 | else: 33 | try: 34 | for path in source: 35 | yield path 36 | except TypeError: 37 | print source, 'is not iterable' 38 | 39 | class DucSgmlReader(FileInput): 40 | 41 | def read(self, input_source): 42 | docs = [] 43 | for path in self.gather_paths(input_source): 44 | with open(path, u"r") as f: 45 | sgml = "".join(f.readlines()) 46 | m = re.search(r"(.*?)", sgml, flags=re.DOTALL) 47 | if m is None: 48 | raise Exception("TEXT not found in " + path) 49 | text = m.group(1).strip() 50 | text_clean = re.sub(r"<[^>]*?>", r"", text) 51 | docs.append(text_clean) 52 | return docs 53 | 54 | class DucAbstractSgmlReader(FileInput): 55 | def read(self, input_source): 56 | docs = [] 57 | for path in self.gather_paths(input_source): 58 | with open(path, u"r") as f: 59 | sgml = "".join(f.readlines()) 60 | m = re.search(r"]+>(.*?)", sgml, flags=re.DOTALL) 61 | if m is None: 62 | raise Exception("SUM not found in " + path) 63 | text = m.group(1).strip() 64 | docs.append(text) 65 | return docs 66 | 67 | class MeadDocSentReader(FileInput): 68 | docsent_patt = (r"") 70 | sent_patt = (r"(.*?)") 73 | def read(self, input_source): 74 | docs = [] 75 | for path in self.gather_paths(input_source): 76 | sents = [] 77 | with open(path, u"r") as f: 78 | xml = "".join(f.readlines()) 79 | m = re.search(self.docsent_patt, xml, flags=re.DOTALL) 80 | if m is None: 81 | raise Exception("DOCSENT not found in " + path) 82 | doc_id = m.group(1) 83 | lang = m.group(3) 84 | for s in re.finditer(self.sent_patt, xml, flags=re.DOTALL): 85 | par = int(s.group(1)) 86 | rsnt = s.group(2) 87 | sno = s.group(3) 88 | text = s.group(4).strip() 89 | if par > 1: 90 | sents.append(text) 91 | #sents.append({u"doc id": doc_id, u"sent id": int(rsnt), 92 | # u"type": u"body" if par > 1 else u"headline", 93 | # u"text": text.decode("utf-8")}) 94 | docs.append("\n".join(sents).decode("utf-8")) 95 | #df = pd.DataFrame( 96 | # sents, columns=[u"doc id", u"type", u"sent id", u"text"]) 97 | #df.set_index([u"doc id", u"sent id"], inplace=True) 98 | return docs 99 | 100 | def load_demo_docs(): 101 | import pkg_resources 102 | input_source = pkg_resources.resource_filename( 103 | "sumpy", 104 | os.path.join("data", "mead_example_docs")) 105 | return MeadDocSentReader().read(input_source) 106 | -------------------------------------------------------------------------------- /sumpy/data/mead_example_docs/87.docsent: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Prayers for victims of Bahrain crash 6 | 7 | MANAMA, Bahrain (AP) _ Three bodies wrapped in cloth, one the size of a small child, were lain before the faithful in the Grand Mosque Friday during a special prayer for the dead in honor of the 143 victims of the Gulf Air crash. 
8 | Bahrain"s Prime Minister Sheik Khalifa bin Salman Al Khalifa and other top officials stood side-by-side with 2,000 Muslims reciting funeral prayers before the bodies, which were among the 107 adults and 36 children killed in Wednesday"s air disaster, said Information Ministry spokesman Syed el-Bably. 9 | Around the island, weekly Friday prayer services devoted time to funeral prayers for the passengers and crew. 10 | Across the street at the Gulf Hotel, relatives of the victims sought comfort from religious leaders and counselors as they continued the painful process of identifying loved ones from books of photographs of remains. 11 | "It"s very difficult to see the pictures," said Nadr al-Khawaja, a Bahraini whose cousin, her husband and their 9-month-old baby son were killed. 12 | "It"s very hard for the parents _ it"s torture." 13 | Salvage attempts were continuing in the shallow waters at the crash site Friday. 14 | Twenty-six U.S. divers joined Bahraini experts scouring the sandy sea floor in search for more bits of wing and fuselage from Gulf Air flight 072. 15 | At dawn Friday, the divers began searching for "diplomatic cargo" being carried by a U.S. government courier, according to Cdr. Jeff Gradeck, spokesman for the U.S. Navy"s 5th Fleet, which is based in Bahrain. 16 | The State Department has said the courier, 31-year-old Seth Foti, was carrying pouches containing classified information. 17 | By midafternoon, there was no word of their recovery. 18 | The U.S. Embassy in Bahrain was planning a private memorial service Saturday for Foti. 19 | He and his wife of three months, Anisha, met at the embassy, where she had worked briefly last year. 20 | Scraps of metal and other remnants were brought to an airport hangar where aviation experts were reconstructing the Airbus 320 for investigators, said Gulf Air spokesman Stephen Tuckwell. 21 | Both of the plane"s "black boxes" _ the flight data and voice cockpit recorders _ were to be shipped abroad for data recovery but aviation experts had not finalized plans on Friday, Gulf Air said. 22 | Tuckwell said it could take weeks before the data was recovered. 23 | Bahrain"s State television had quoted witnesses soon after the crash who described seeing a fire in one of the aircraft"s engines. 24 | Gulf Air officials said there was no fire and other witnesses have said they did not see flames. 25 | Meanwhile, the U.S. Embassy here said air accident investigators from the National Transportation and Safety Board were en route to Manama on Friday to join Bahraini investigators in determining the cause of the crash. 26 | Six French government experts and a representative of Airbus Industries arrived Thursday evening to look into the crash _ the sixth for an Airbus 320 in the last 12 years. 27 | Flight 072 crashed in shallow water near shore and Ali Ahmedi, a spokesman and an acting vice president for Gulf Air, has said the pilot gave no indication to air traffic controllers that there were any problems in the plane. 28 | Gulf Air said 135 passengers and eight crew members were on board. 29 | Sixty-three passengers were Egyptian, 34 Bahraini, 12 Saudi Arabian, nine Palestinian, six from the United Arab Emirates, three Chinese, two British and one each from Canada, Oman, Kuwait, Sudan, Australia and the United States. 30 | Two crew members were Bahrainis with one each from Oman, the Philippines, Poland, India, Morocco and Egypt. 
31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /sumpy/eval.py: -------------------------------------------------------------------------------- 1 | from nltk.util import ngrams 2 | from sumpy.preprocessor import (SentenceTokenizerMixin, 3 | ROUGEWordTokenizerMixin, SMARTStopWordsMixin, LengthLimiterMixin) 4 | import pandas as pd 5 | 6 | class ROUGE(SentenceTokenizerMixin, ROUGEWordTokenizerMixin, 7 | SMARTStopWordsMixin, LengthLimiterMixin): 8 | def __init__(self, sentence_tokenizer=None, word_tokenizer=None, 9 | max_ngrams=2, remove_stopwords=False, stopwords=None, 10 | show_per_model_results=False, limit=None, limit_type=None): 11 | 12 | self._sentence_tokenizer = sentence_tokenizer 13 | self._word_tokenizer = word_tokenizer 14 | self._max_ngrams = max_ngrams 15 | self.remove_stopwords = remove_stopwords 16 | self._stopwords = stopwords 17 | self._show_per_model_results = show_per_model_results 18 | self._limit = limit 19 | self._limit_type = limit_type 20 | 21 | def evaluate(self, systems, models): 22 | models = list(models) # make model order consistent 23 | sent_tokenizer = self.build_sent_tokenizer() 24 | word_tokenizer = self.build_word_tokenizer() 25 | length_limiter = self.build_length_limiter() 26 | is_stopword = self.build_stopwords() 27 | results = [] 28 | result_index = [] 29 | for name, system in systems: 30 | sys_ngram_sets = self.extract_ngrams( 31 | system, sent_tokenizer, word_tokenizer, self._max_ngrams, 32 | is_stopword, length_limiter) 33 | 34 | for model_no, model in enumerate(models, 1): 35 | model_ngram_sets = self.extract_ngrams( 36 | model, sent_tokenizer, word_tokenizer, self._max_ngrams, 37 | is_stopword, length_limiter) 38 | scores = self.compute_prf( 39 | sys_ngram_sets, model_ngram_sets, self._max_ngrams) 40 | result_index.append((name, model_no)) 41 | results.append(scores) 42 | 43 | # Collect results as a pandas DataFrame and compute the mean 44 | # performance. 45 | col_index = [] 46 | dataframe_cols = [] 47 | for i in xrange(1, self._max_ngrams + 1): 48 | rouge_n = u"ROUGE-{}".format(i) 49 | col_index.append((rouge_n, "Recall")) 50 | col_index.append((rouge_n, "Prec.")) 51 | col_index.append((rouge_n, "F1")) 52 | 53 | row_index = pd.MultiIndex.from_tuples( 54 | result_index, names=['system', 'model']) 55 | col_index = pd.MultiIndex.from_tuples(col_index) 56 | df = pd.DataFrame(results, columns=col_index, index=row_index) 57 | df2 = df.groupby(level=0).mean() 58 | if self._show_per_model_results is True: 59 | df2['model'] = 'AVG' 60 | df2 = df2.reset_index().set_index(['system','model']).append(df) 61 | df2 = df2.sort() 62 | 63 | return df2 64 | 65 | def extract_ngrams(self, text, sent_tokenizer, word_tokenizer, max_ngrams, 66 | is_stopword, length_limiter): 67 | ngram_sets = {} 68 | sents = sent_tokenizer(text) 69 | 70 | tokens = [] 71 | for sent in sents: 72 | tokens.extend([word.lower() for word in word_tokenizer(sent)]) 73 | 74 | # Remove stopwords. 
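        # (Stopword filtering is only active when remove_stopwords=True; otherwise
        # is_stopword always returns False -- see SMARTStopWordsMixin in
        # sumpy/preprocessor.py.)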
75 | tokens = [word for word in tokens if is_stopword(word) is False] 76 | tokens = length_limiter(tokens) 77 | 78 | for i in xrange(1, max_ngrams + 1): 79 | ngram_sets[i] = {} 80 | total = 0 81 | for ngram in ngrams(tokens, i): 82 | ngram_sets[i][ngram] = ngram_sets[i].get(ngram, 0) + 1 83 | total += 1 84 | ngram_sets[i][u"__TOTAL__"] = total 85 | return ngram_sets 86 | 87 | def compute_prf(self, sys_ngram_sets, model_ngram_sets, max_ngrams): 88 | scores = [] 89 | for i in xrange(1, max_ngrams + 1): 90 | intersect = 0 91 | for ngram, model_ngram_count in model_ngram_sets[i].items(): 92 | if ngram == "__TOTAL__": 93 | continue 94 | sys_ngram_count = sys_ngram_sets[i].get(ngram, 0) 95 | intersect += min(model_ngram_count, sys_ngram_count) 96 | recall = float(intersect) / model_ngram_sets[i][u"__TOTAL__"] 97 | prec = float(intersect) / sys_ngram_sets[i][u"__TOTAL__"] 98 | 99 | if intersect == 0: 100 | print "Warning: 0 {}-gram overlap".format(i) 101 | f1 = 0 102 | else: 103 | f1 = 2 * prec * recall / (prec + recall) 104 | scores.append(recall) 105 | scores.append(prec) 106 | scores.append(f1) 107 | 108 | return scores 109 | -------------------------------------------------------------------------------- /sumpy/preprocessor.py: -------------------------------------------------------------------------------- 1 | import nltk.data 2 | from nltk.tokenize import WordPunctTokenizer 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | import re 5 | import gzip 6 | import pkg_resources 7 | import os 8 | 9 | class SentenceTokenizerMixin(object): 10 | def build_sent_tokenizer(self): 11 | """Return a function that splits a string into a sequence of 12 | sentences.""" 13 | if self._sentence_tokenizer is not None: 14 | tok = self._sentence_tokenizer 15 | else: 16 | tok = nltk.data.load('tokenizers/punkt/english.pickle').tokenize 17 | return tok 18 | 19 | 20 | class WordTokenizerMixin(object): 21 | def build_word_tokenizer(self): 22 | """Return a function that splits a string into a sequence of words.""" 23 | if self._word_tokenizer is not None: 24 | tokenize = self._word_tokenizer 25 | else: 26 | tokenize = WordPunctTokenizer().tokenize 27 | return tokenize 28 | 29 | 30 | class ROUGEWordTokenizerMixin(object): 31 | def build_word_tokenizer(self): 32 | """This mixin provides the same reg-ex based word tokenizer that is 33 | used in the official ROUGE perl script (Lin, 2004). 
See the readText 34 | subroutine (line 1816) of ROUGE-1.5.5.pl for reference.""" 35 | if self._word_tokenizer is not None: 36 | tokenize = self._word_tokenizer 37 | else: 38 | def rouge_tokenize(sentence): 39 | s = re.sub(r"-", r" -", sentence, flags=re.UNICODE) 40 | s = re.sub(r"[^A-Za-z0-9\-]", r" ", s, flags=re.UNICODE) 41 | s = s.strip() 42 | s = re.sub(r"\s+", r" ", s, flags=re.UNICODE) 43 | return s.split(u" ") 44 | tokenize = rouge_tokenize 45 | return tokenize 46 | 47 | class CorpusTfidfMixin(object): 48 | def build_tfidf_vectorizer(self): 49 | self._tfidf_vectorizer = TfidfVectorizer(analyzer=lambda x: x) 50 | return self._tfidf_vectorizer.fit_transform 51 | 52 | class TextAnalyzerMixin(object): 53 | 54 | def build_analyzer(self): 55 | sent_tokenize = self._build_sent_tokenizer() 56 | word_tokenize = self._build_word_tokenizer() 57 | stem = self._build_stemmer() 58 | def analyzer(text): 59 | sents = sent_tokenize(text) 60 | tokenized_sents = [[stem(word) for word in word_tokenize(sent)] 61 | for sent in sents] 62 | return tokenized_sents, sents 63 | return analyzer 64 | 65 | def _build_sent_tokenizer(self): 66 | """Return a function that splits a string into a sequence of 67 | sentences.""" 68 | if self._sentence_tokenizer is not None: 69 | return self._sentence_tokenizer 70 | else: 71 | return nltk.data.load('tokenizers/punkt/english.pickle').tokenize 72 | 73 | def _build_word_tokenizer(self): 74 | """Return a function that splits a string into a sequence of words.""" 75 | if self._word_tokenizer is not None: 76 | tokenize = self._word_tokenizer 77 | else: 78 | tokenize = WordPunctTokenizer().tokenize 79 | 80 | return tokenize 81 | 82 | def _build_stemmer(self): 83 | if self._stemmer is not None: 84 | return self._stemmer 85 | else: return lambda w: w 86 | 87 | class SMARTStopWordsMixin(object): 88 | def build_stopwords(self): 89 | if self.remove_stopwords is True: 90 | if self._stopwords is None: 91 | path = pkg_resources.resource_filename( 92 | "sumpy", 93 | os.path.join("data", "smart_common_words.txt.gz")) 94 | with gzip.open(path, u"r") as f: 95 | self._stopwords = set( 96 | [word.strip().decode(u"utf-8").lower() 97 | for word in f.readlines()]) 98 | return lambda word: word in self._stopwords 99 | else: 100 | return lambda word: False 101 | 102 | class LengthLimiterMixin(object): 103 | def build_length_limiter(self): 104 | """ 105 | Return a function that shortens a list of tokens to a 106 | desired length. 107 | """ 108 | if self._limit is None and self._limit_type is not None: 109 | raise Exception("Both limit and limit_type must be set.") 110 | if self._limit is not None and self._limit_type is None: 111 | raise Exception("Both limit and limit_type must be set.") 112 | if self._limit_type not in [None, u"word"]: 113 | raise Exception( 114 | "limit_type: {} not implemented.".format(self._limit_type)) 115 | 116 | if self._limit_type is None: 117 | # Do not shorten, just return tokens unchanged. 118 | return lambda x: x 119 | if self._limit_type == u"word": 120 | # Shorten list to be `_limit` tokens long. 121 | def word_limiter(sequence): 122 | if len(sequence) < self._limit: 123 | print "Warning: document is shorter than the max length" \ 124 | + " limit. This can effect evaluation negatively." 
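                    # If the document is already shorter than _limit, the slice
                    # below returns it unchanged.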
125 | return sequence[:self._limit] 126 | return word_limiter 127 | -------------------------------------------------------------------------------- /sumpy/annotators/_preprocessor.py: -------------------------------------------------------------------------------- 1 | from sumpy.annotators._annotator_base import _AnnotatorBase 2 | import pkg_resources 3 | import gzip 4 | import os 5 | import pandas as pd 6 | import nltk 7 | from nltk.tokenize import WordPunctTokenizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | from sklearn.feature_extraction.text import CountVectorizer 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | import re 12 | 13 | 14 | class SentenceTokenizerMixin(_AnnotatorBase): 15 | """ 16 | Analyze method takes a string (an article text usually) and splits it 17 | into substrings corresponding to the sentences in the origial article. 18 | """ 19 | 20 | def requires(self): 21 | return ["doc text"] 22 | 23 | def ndarray_requires(self): 24 | return [] 25 | 26 | def returns(self): 27 | return ["sent id", "sent text"] 28 | 29 | def ndarray_returns(self): 30 | return [] 31 | 32 | def name(self): 33 | return "SentenceTokenizerMixin" 34 | 35 | def build(self): 36 | 37 | if not hasattr(self, "_sentence_tokenizer"): 38 | self._sentence_tokenizer = None 39 | 40 | if self._sentence_tokenizer is None: 41 | dl = nltk.downloader.Downloader() 42 | if dl.is_installed("punkt") is False: 43 | print "Installing NLTK Punkt Sentence Tokenizer" 44 | dl.download("punkt") 45 | 46 | self._sentence_tokenizer = nltk.data.load( 47 | 'tokenizers/punkt/english.pickle').tokenize 48 | 49 | def process(self, input_df, ndarray_data): 50 | def split_text(group): 51 | row = group.irow(0) 52 | sents = self._sentence_tokenizer(row["doc text"]) 53 | return pd.DataFrame([{"doc id": row["doc id"], 54 | "sent id": i, "sent text": sent} 55 | for i, sent in enumerate(sents, 1)]) 56 | 57 | processed_df = input_df.groupby( 58 | "doc id", group_keys=False).apply(split_text) 59 | 60 | cols = input_df.columns.difference(processed_df.columns).tolist() 61 | cols += ["doc id"] 62 | output_df = input_df[cols].merge( 63 | processed_df, on="doc id", how="inner") 64 | return output_df, ndarray_data 65 | 66 | class WordTokenizerMixin(SentenceTokenizerMixin): 67 | """Analyze method takes a string (corresponding to a sentence) and splits 68 | it into substrings corresponding to the words in original aritcle.""" 69 | 70 | def build(self): 71 | 72 | if not hasattr(self, "_word_tokenizer"): 73 | self._word_tokenizer = None 74 | 75 | if self._word_tokenizer is None: 76 | self._word_tokenizer = WordPunctTokenizer().tokenize 77 | 78 | def process(self, input_df, ndarray_data): 79 | input_df["words"] = input_df["sent text"].apply( 80 | self._word_tokenizer) 81 | return input_df, ndarray_data 82 | 83 | def requires(self): 84 | return ["sent id", "sent text"] 85 | 86 | def ndarray_requires(self): 87 | return [] 88 | 89 | def returns(self): 90 | return ["words"] 91 | 92 | def ndarray_returns(self): 93 | return [] 94 | 95 | def name(self): 96 | return "WordTokenizerMixin" 97 | 98 | class RawBOWMixin(WordTokenizerMixin): 99 | 100 | def build(self): 101 | 102 | if not hasattr(self, "_count_vectorizer"): 103 | self._count_vectorizer = None 104 | 105 | if self._count_vectorizer is None: 106 | self._count_vectorizer = CountVectorizer( 107 | input=u"content", preprocessor=lambda x: x, 108 | tokenizer=lambda x: x) 109 | 110 | def process(self, input_df, ndarray_data): 111 | ndarray_data["RawBOWMatrix"] = 
self._count_vectorizer.fit_transform( 112 | input_df["words"].tolist()) 113 | return input_df, ndarray_data 114 | 115 | def requires(self): 116 | return ["words"] 117 | 118 | def returns(self): 119 | return [] 120 | 121 | def ndarray_requires(self): 122 | return [] 123 | 124 | def ndarray_returns(self): 125 | return ["RawBOWMatrix"] 126 | 127 | def name(self): 128 | return "RawBOWMixin" 129 | 130 | class BinaryBOWMixin(RawBOWMixin): 131 | 132 | def build(self): 133 | pass 134 | 135 | def process(self, input_df, ndarray_data): 136 | X = ndarray_data["RawBOWMatrix"].copy() 137 | X[X > 0] = 1 138 | ndarray_data["BinaryBOWMatrix"] = X 139 | return input_df, ndarray_data 140 | 141 | def requires(self): 142 | return [] 143 | 144 | def returns(self): 145 | return [] 146 | 147 | def ndarray_requires(self): 148 | return ["RawBOWMatrix",] 149 | 150 | def ndarray_returns(self): 151 | return ["BinaryBOWMatrix"] 152 | 153 | def name(self): 154 | return "BinaryBOWMixin" 155 | 156 | class TfIdfMixin(RawBOWMixin): 157 | def build(self): 158 | if not hasattr(self, "_tfidf_transformer"): 159 | self._tfidf_transformer = None 160 | 161 | if self._tfidf_transformer is None: 162 | self._tfidf_transformer = TfidfTransformer() 163 | #input=u"content", preprocessor=lambda x: x, 164 | #tokenizer=lambda x: x) 165 | 166 | def process(self, input_df, ndarray_data): 167 | X = self._tfidf_transformer.fit_transform( 168 | ndarray_data["RawBOWMatrix"]) 169 | ndarray_data["TfIdfMatrix"] = X 170 | return input_df, ndarray_data 171 | 172 | def requires(self): 173 | return [] 174 | 175 | def returns(self): 176 | return [] 177 | 178 | def ndarray_requires(self): 179 | return ["RawBOWMatrix",] 180 | 181 | def ndarray_returns(self): 182 | return ["TfIdfMatrix"] 183 | 184 | def name(self): 185 | return "TfIdfMixin" 186 | 187 | class TfIdfCosineSimilarityMixin(TfIdfMixin): 188 | 189 | def build(self): 190 | pass 191 | 192 | def process(self, input_df, ndarray_data): 193 | K = cosine_similarity(ndarray_data["TfIdfMatrix"]) 194 | ndarray_data["TfIdfCosSimMatrix"] = K 195 | return input_df, ndarray_data 196 | 197 | def requires(self): 198 | return [] 199 | 200 | def returns(self): 201 | return [] 202 | 203 | def ndarray_requires(self): 204 | return ["TfIdfMatrix"] 205 | 206 | def ndarray_returns(self): 207 | return ["TfIdfCosSimMatrix"] 208 | 209 | def name(self): 210 | return "TfIdfCosineSimilarityMixin" 211 | 212 | 213 | -------------------------------------------------------------------------------- /sumpy/data/mead_example_docs/81.docsent: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | One American among 143 dead in crash 6 | 7 | MINA SALMAN PORT, Bahrain (AP) -- A man's black shoe, a plastic sandal and bits of yellow foam padding bobbed Thursday in the waters off this tiny island nation, where families were burying loved ones a day after Gulf Air Flight 072 crashed, killing all 143 aboard. 8 | Bahraini authorities and U.S. Navy divers based in the Gulf recovered both ''black boxes'' - the flight data and voice cockpit recorders - near where the plane slammed into shallow water off Bahrain's shore. 9 | Neither box appeared damaged, according to Bahrain civil defense chief James Windsor, who received the voice cockpit recorder Thursday from U.S. Navy divers who brought it to shore. 10 | Authorities were awaiting the arrival of experts from the U.S. National Transportation Safety Board for help with the Bahraini-led investigation. 
11 | Six French government experts and an Airbus Industries representative flew in Thursday evening. 12 | Ali Ahmedi, a spokesman and an acting vice president for Gulf Air, said it was too early to speculate on what caused the plane to crash as it circled the airport before coming in to land. 13 | But he said there was no indication the pilot was anticipating an emergency landing. 14 | ''The pilot did not make any kind of statements of problems in the plane,'' Ahmedi said. 15 | Transportation Minister Sheik Ali bin Khalifa Al Khalifa said he was hopeful the black boxes would provide some clues. 16 | ''Any news, anything out of it would be a help,'' he said. 17 | Under the best of circumstances, a water landing is risky, said Michael Barr, director of the aviation program at the University of Southern California. 18 | Even a pilot coming in relatively slowly onto the water, hoping to skip across its surface like a stone tossed by a child, could clip a wing and lose control, he said. 19 | And the depth of the water would make little difference to the landing, experts said: A large airplane that crashes at high speed is going to be destroyed, whatever it hits. 20 | Evidence of that destruction lay off Bahrain on Thursday. 21 | In waters often less than 10 feet deep, shadowy bits of wing and fuselage, mostly in small pieces, were resting on the sandy sea floor. 22 | A few recognizable pieces of the Gulf Air Airbus 320 protruded from the water: a ripped tail wing with the airline's black, red and gold logo, skin of the fuselage with the letters 'LF AIR' above the surface. 23 | Most traces of the 143 victims were collected in the hours after the Cairo-to-Bahrain flight crashed Wednesday evening. 24 | Luggage and clothing that floated to the surface were removed so they wouldn't be swept away with the tides. 25 | Like the plane, many of the bodies were shattered, and relatives struggled to identify loved ones so they could claim their remains for burial. 26 | At a hotel in the capital, relatives sobbed as a Gulf Air official, his voice choking, read out names of their loved ones listed as victims. 27 | Family members were asked to make identifications from photos taken after the bodies were recovered. 28 | ''This is the worst day of my life. 29 | I lost a part of me,'' said Khalifa al-Hashil, 45, of Saudi Arabia. 30 | His 35-year-old brother, Mohammed, died in the crash. 31 | Fifteen victims were buried Thursday at Manama Cemetery, the country's largest. 32 | Mohammed Jassim, 45, an undertaker at the cemetery, washed disfigured faces and mutilated bodies with rose water before the remains - still in body bags tagged at a makeshift morgue - were placed in freshly dug graves. 33 | ''It's a painful sight,'' he said. 34 | ''I've handled dead bodies before, but none so dreadful to look at.'' 35 | In 15-minute intervals, white Health Ministry vans pulled up at the cemetery to unload victims in tagged body bags. 36 | Chants of ''God is Great'' and mournful wails wafted over the cemetery during the burial. 37 | Relatives offered prayers for the dead, standing side by side, while others wept on each other's shoulders as clerics tried to comfort them. 38 | Thirty-six of the 143 victims were children, officials said. 39 | All appeared to have been traveling with their families. 40 | Many families in the region are ending vacations at this time of year, which could account for the large number of children aboard. 
41 | Amjad Obaid, a physician, was burying his sister-in-law, 4-year-old niece and 10-year-old nephew. 42 | He said a disaster alert on his pager had summoned him to work. 43 | ''Only when I got to the hospital I found out that this plane carried my brother's wife and her children,'' Obaid said. 44 | They had been returning from a vacation in Egypt. 45 | After the crash, U.S. Navy helicopters, small boats and an oceangoing tug quickly joined the nighttime search and rescue effort a few miles off the northern coast of Bahrain. 46 | The island is the headquarters of the U.S. Navy's 5th Fleet. 47 | Bahraini Crown Prince Sheik Salman bin Hamad Al Khalifa personally directed the effort, the U.S. military said. 48 | Gulf Air said 135 passengers and eight crew members were on board. 49 | They included 64 Egyptians, 36 Bahrainis, 12 Saudi Arabians, nine Palestinians, six from the United Arab Emirates, three Chinese, two British and one each from the United States, Canada, Oman, Kuwait, Sudan, Australia, Oman, the Philippines, Poland, India and Morocco. 50 | The American killed in the crash was 31-year-old Seth J. Foti, a diplomatic courier carrying classified information in yellow pouches, the State Department said. 51 | Foti had joined the service 14 months ago, spokesman Richard Boucher said. 52 | He said he did not know what Foti had with him when the plane went down. 53 | ''His dedication to the mission of the courier service was unmatched, and he was clearly an asset to the Department of State and the U.S. government,'' Boucher said. 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /sumpy/system/_base.py: -------------------------------------------------------------------------------- 1 | from sumpy.annotators._annotator_base import _AnnotatorBase 2 | from sumpy.annotators import SentenceTokenizerMixin, WordTokenizerMixin 3 | from sumpy.document import Summary 4 | from abc import ABCMeta, abstractmethod 5 | import pandas as pd 6 | import numpy as np 7 | import networkx as nx 8 | 9 | class _SystemBase(object): 10 | """Abstract base class for summarizer systems.""" 11 | 12 | __metaclass__ = ABCMeta 13 | 14 | def __init__(self, verbose=False): 15 | self.verbose = verbose 16 | self._dependency_graph = None 17 | self._annotators = None 18 | self._pipeline = None 19 | 20 | @abstractmethod 21 | def build_summary(self, input_df, ndarray_data): 22 | pass 23 | 24 | def summarize(self, inputs): 25 | 26 | if not hasattr(self, "_pipeline") or self._pipeline is None: 27 | self.build_pipeline() 28 | 29 | input_df, ndarray_data = self.prepare_inputs(inputs) 30 | processed_df, processed_ndarray_data = self.process_input( 31 | input_df, ndarray_data) 32 | 33 | return self.build_summary(processed_df, processed_ndarray_data) 34 | 35 | def build_pipeline(self): 36 | self.build_dependency_graph() 37 | self._pipeline = [] 38 | for node in nx.topological_sort(self._dependency_graph): 39 | if node in self._annotators: 40 | self._pipeline.append(self._annotators[node]) 41 | if self.verbose: 42 | print "{} ({}) build".format(self.__class__.__name__, 43 | self._annotators[node].name(self)) 44 | self._annotators[node].build(self) 45 | 46 | def prepare_inputs(self, inputs, ndarray_data=None): 47 | 48 | requires = set() 49 | returns = set() 50 | ndarray_requires = set() 51 | ndarray_returns = set() 52 | 53 | for ann in self._pipeline: 54 | requires.update(ann.requires(self)) 55 | returns.update(ann.returns(self)) 56 | ndarray_requires.update(ann.ndarray_requires(self)) 57 
| ndarray_returns.update(ann.ndarray_returns(self)) 58 | 59 | # Allocate keys for ndarray dependencies. 60 | if ndarray_data is None: 61 | ndarray_data = {} 62 | for key in ndarray_requires.union(ndarray_returns): 63 | if key not in ndarray_data: 64 | ndarray_data[key] = None 65 | 66 | # Allocate columns for dataframe data dependencies. 67 | all_cols = list(requires.union(returns)) 68 | if isinstance(inputs, list) or isinstance(inputs, tuple): 69 | df = pd.DataFrame([{"doc id": doc_id, "doc text": doc_text} 70 | for doc_id, doc_text in enumerate(inputs)], 71 | columns=["doc id"] + all_cols) 72 | return df, ndarray_data 73 | 74 | elif isinstance(inputs, pd.DataFrame): 75 | if "doc id" not in inputs: 76 | raise Exception("input DataFrame must have column 'doc id'") 77 | 78 | cols = list(set(inputs.columns.tolist() + all_cols)) 79 | df = pd.DataFrame(inputs.to_dict(), columns=cols) 80 | df.reset_index(inplace=True) 81 | return df, ndarray_data 82 | else: 83 | raise Exception("Bad input: list of strings or dataframe only.") 84 | 85 | def process_input(self, input_df, ndarray_data): 86 | cols = set(input_df.columns.tolist()) 87 | for ann in self._pipeline: 88 | 89 | for rtype in ann.returns(self): 90 | assert rtype in cols 91 | 92 | for req in ann.requires(self): 93 | assert req in cols 94 | 95 | run_stage = input_df[ann.returns(self)].isnull().any().any() \ 96 | or np.any([ndarray_data[rtype] is None 97 | for rtype in ann.ndarray_returns(self)]) 98 | 99 | if run_stage: 100 | 101 | if self.verbose: 102 | print "{} ({}) process".format( 103 | self.__class__.__name__, ann.name(self)) 104 | 105 | input_df, ndarray_data = ann.process( 106 | self, input_df, ndarray_data) 107 | 108 | return input_df, ndarray_data 109 | 110 | def build_dependency_graph(self): 111 | G = nx.DiGraph() 112 | self._annotators = {} 113 | 114 | def check_mixins(clazz, visited=set()): 115 | if not issubclass(clazz, _SystemBase): 116 | if issubclass(clazz, _AnnotatorBase): 117 | name = clazz.name(self) 118 | self._annotators[name] = clazz 119 | for req in clazz.requires(self): 120 | G.add_edge(req, name) 121 | for req in clazz.ndarray_requires(self): 122 | G.add_edge(req, name) 123 | 124 | for rtype in clazz.returns(self): 125 | G.add_edge(name, rtype) 126 | for rtype in clazz.ndarray_returns(self): 127 | G.add_edge(name, rtype) 128 | 129 | visited.add(clazz) 130 | for base in clazz.__bases__: 131 | if base in visited: 132 | continue 133 | if not issubclass(base, _AnnotatorBase): 134 | continue 135 | if base == _AnnotatorBase: 136 | continue 137 | check_mixins(base, visited) 138 | 139 | check_mixins(self.__class__) 140 | self._dependency_graph = G 141 | 142 | def print_dependency_graph(self, filename=None, to_iPython=True): 143 | import pygraphviz as pgv 144 | if not hasattr(self, "_dependency_graph") or \ 145 | self._dependency_graph is None: 146 | self.build_dependency_graph() 147 | 148 | if filename is None: 149 | filename = "sumpy.tmp.png" 150 | 151 | G = pgv.AGraph(strict=False, directed=True) 152 | for node in self._dependency_graph: 153 | if node in self._annotators: 154 | G.add_node(node) 155 | G.get_node(node).attr["shape"] ="rectangle" 156 | elif node.startswith("f:"): 157 | G.add_node(node) 158 | G.get_node(node).attr["shape"] ="parallelogram" 159 | for edge in self._dependency_graph.in_edges(node): 160 | G.add_edge(edge[0], edge[1], color="green") 161 | else: 162 | for in_edge in self._dependency_graph.in_edges(node): 163 | for out_edge in self._dependency_graph.out_edges(node): 164 | G.add_edge(in_edge[0], 
out_edge[1], 165 | label=node, key=node) 166 | 167 | G.layout("dot") 168 | G.draw(filename) 169 | if to_iPython is True: 170 | from IPython.display import Image 171 | return Image(filename=filename) 172 | 173 | class AverageFeatureRankerBase( 174 | WordTokenizerMixin, _SystemBase): 175 | 176 | def build_summary(self, input_df, ndarray_data): 177 | cols = [f for f in input_df.columns.tolist() if f.startswith("f:")] 178 | X = input_df[cols].values 179 | input_df["rank"] = (X / X.max(axis=0)).mean(axis=1) 180 | output_df = input_df.sort(["rank"], ascending=False) 181 | return Summary(output_df) 182 | -------------------------------------------------------------------------------- /sumpy/annotators/_submodular.py: -------------------------------------------------------------------------------- 1 | from sumpy.annotators import WordTokenizerMixin, TfIdfCosineSimilarityMixin 2 | import numpy as np 3 | 4 | 5 | class SubmodularMMRMixin(TfIdfCosineSimilarityMixin): 6 | 7 | def build(self): 8 | if not hasattr(self, "lam"): 9 | self.lam = .3 10 | assert 0 <= self.lam 11 | 12 | if not hasattr(self, "scale"): 13 | self.scale = 1.0 14 | assert 0 <= self.scale 15 | 16 | if not hasattr(self, "budget_type"): 17 | self.budget_type = "word" 18 | assert self.budget_type in ["word", "byte"] 19 | 20 | if not hasattr(self, "budget_size"): 21 | self.budget_size = 400 22 | assert 0 < self.budget_size 23 | 24 | def rank(input_df, ndarray_data): 25 | if self.budget_type == "word": 26 | B = np.array(ndarray_data["RawBOWMatrix"].sum(axis=1)) 27 | print type(B) 28 | elif self.budget_type == "byte": 29 | B = input_df["sent text"].apply(lambda x: len(x.replace("\n", ""))).values 30 | K = ndarray_data["TfIdfCosSimMatrix"] 31 | K = np.ma.masked_array(K, mask=np.diag(np.diag(K))) 32 | assert B.shape[0] == K.shape[0] 33 | 34 | #B = B[[0, 25, 54, 80]] 35 | print B 36 | #K = K[[0, 25, 54, 80]][:,[0, 25, 54, 80]] 37 | print K 38 | K_S = np.ma.masked_array(K, mask=False, hardmask=False) 39 | print K_S 40 | K_V = np.ma.masked_array(K, mask=False, hardmask=False) 41 | print K_V 42 | 43 | print 44 | print 45 | 46 | S = [] 47 | B_S = 0 48 | V = range(K.shape[0]) 49 | inspected_vertices = set() 50 | f_of_S = 0 51 | for rank in xrange(K.shape[0], 0, -1): 52 | #print "K_S" 53 | #print K_S 54 | #print "S" 55 | #print S 56 | #print "V" 57 | #print V 58 | max_gain = float("-inf") 59 | max_idx = None 60 | max_v = None 61 | max_f_of_S_plus_v = None 62 | for i, v in enumerate(V): 63 | if v in inspected_vertices: 64 | continue 65 | S_tmp = S + [v] 66 | V_tmp = V[:i] + V[i+1:] 67 | #print S_tmp 68 | #print V_tmp 69 | #print K[S_tmp][:, V_tmp] 70 | #print K[S_tmp][:, S_tmp].filled(0).sum() 71 | f_of_S_plus_v = K[S_tmp][:, V_tmp].sum() - \ 72 | self.lam * K[S_tmp][:, S_tmp].filled(0).sum() 73 | gain = (f_of_S_plus_v - f_of_S) / (B[v] ** self.scale) 74 | 75 | if gain > max_gain: 76 | max_gain = gain 77 | max_idx = i 78 | max_v = v 79 | max_f_of_S_plus_v = f_of_S_plus_v 80 | #print v, gain 81 | 82 | 83 | #del V[max_idx] 84 | 85 | if max_gain > 0 and B_S + B[max_v] <= self.budget_size: 86 | print "Adding", max_v, "f(S + v) =", max_f_of_S_plus_v 87 | S += [max_v] 88 | del V[max_idx] 89 | f_of_S = max_f_of_S_plus_v 90 | print "B_v", B[max_v], "B_S", B_S, "B_S + B_v", B_S + B[max_v] 91 | B_S += B[max_v] 92 | input_df.ix[max_v, "f:submodular-mmr"] = rank 93 | 94 | inspected_vertices.add(max_v) 95 | 96 | 97 | #else: 98 | 99 | 100 | #print "Iter {} f(S) = {}".format(rank, f_of_S) 101 | #print 102 | #print 103 | #f_cut = K.sum(axis=1) 104 | #print f_cut 
105 | #if rank == K.shape[0] - 2: 106 | # break 107 | 108 | return input_df, ndarray_data 109 | self._submodular_mmr = rank 110 | 111 | def process(self, input_df, ndarray_data): 112 | return self._submodular_mmr(input_df, ndarray_data) 113 | 114 | def requires(self): 115 | return ["sent text"] 116 | 117 | def ndarray_requires(self): 118 | return ["TfIdfCosSimMatrix", "RawBOWMatrix"] 119 | 120 | def returns(self): 121 | return ["f:submodular-mmr"] 122 | 123 | def ndarray_returns(self): 124 | return [] 125 | 126 | def name(self): 127 | return "SubmodularMMRMixin" 128 | 129 | class MonotoneSubmodularMixin(WordTokenizerMixin): 130 | def build(self): 131 | if not hasattr(self, "k"): 132 | self.k = 5 133 | assert self.k > 0 134 | 135 | if not hasattr(self, "f_of_A") or self.f_of_A is None: 136 | def f_of_A(system, A, V_min_A, e, input_df, ndarray_input): 137 | return len( 138 | set([word for words in input_df.ix[A, "words"].tolist() for word in words])) 139 | self.f_of_A = f_of_A 140 | 141 | def process(self, input_df, ndarray_data): 142 | 143 | input_size = len(input_df) 144 | S = [] 145 | V_min_S = [i for i in xrange(input_size)] 146 | f_of_S = 0 147 | for i in xrange(self.k): 148 | arg_max = None 149 | gain_max = 0 150 | f_of_S_max = 0 151 | for pos, elem in enumerate(V_min_S): 152 | S_plus_e = S + [elem] 153 | V_min_S_plus_e = V_min_S[:pos] + V_min_S[pos+1:] 154 | score = self.f_of_A( 155 | self, S_plus_e, V_min_S_plus_e, elem, input_df, ndarray_data) 156 | gain = score - f_of_S 157 | 158 | if gain > gain_max: 159 | arg_max = pos 160 | gain_max = gain 161 | f_of_S_max = score 162 | 163 | if arg_max is not None: 164 | S += [V_min_S[arg_max]] 165 | f_of_S = f_of_S_max 166 | del V_min_S[arg_max] 167 | 168 | input_df.ix[S, "f:monotone-submod"] = 1 169 | input_df.ix[V_min_S, "f:monotone-submod"] = 0 170 | 171 | return input_df, ndarray_data 172 | 173 | def process2(self, input_df, ndarray_data): 174 | 175 | input_size = len(input_df) 176 | S = [] 177 | N = set() 178 | 179 | n_of_e = input_df["nuggets"].tolist() 180 | V_min_S = [i for i in xrange(input_size)] 181 | f_of_S = 0 182 | 183 | 184 | for i in xrange(self.k): 185 | arg_max = None 186 | gain_max = 0 187 | for pos, elem in enumerate(V_min_S): 188 | #print "elem", elem 189 | #print "S", S 190 | #print "V_min_S", V_min_S 191 | #print "n(e) =", n_of_e[elem] 192 | n_of_S_U_e = N.union(n_of_e[elem]) 193 | #print "S U {e}", S + [elem] 194 | #print "n(S U {e})", n_of_S_U_e 195 | 196 | gain = self._f_of_S(n_of_S_U_e) - f_of_S 197 | #print "gain", gain 198 | #print 199 | if gain > gain_max: 200 | arg_max = pos 201 | gain_max = gain 202 | 203 | if arg_max is not None: 204 | S = S + [V_min_S[arg_max]] 205 | N = N.union(n_of_e[V_min_S[arg_max]]) 206 | f_of_S = len(N) 207 | 208 | print "ARG MAX", V_min_S[arg_max] 209 | print "S", S 210 | print "N", N 211 | print "f(S)", f_of_S 212 | 213 | del V_min_S[arg_max] 214 | 215 | 216 | print S 217 | print input_df 218 | print input_size 219 | input_df.ix[S, "f:monotone-submod"] = 1 220 | input_df.ix[V_min_S, "f:monotone-submod"] = 0 221 | 222 | 223 | return input_df, ndarray_data 224 | 225 | 226 | def requires(self): 227 | return ["words"] 228 | 229 | def ndarray_requires(self): 230 | return [] 231 | 232 | def returns(self): 233 | return ["f:montone-submod"] 234 | 235 | def ndarray_returns(self): 236 | return [] 237 | 238 | def name(self): 239 | return "MonotoneSubmod" 240 | 241 | 242 | 243 | -------------------------------------------------------------------------------- /sumpy/__init__.py: 
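The two annotators above, SubmodularMMRMixin and MonotoneSubmodularMixin, follow the same greedy recipe: repeatedly add the candidate sentence with the largest marginal gain in the objective, scale that gain by a per-sentence cost when a word or byte budget applies, and stop once nothing improves the objective or the budget is exhausted. The following is a minimal, self-contained sketch of that budgeted greedy selection, not sumpy's API; the names greedy_budgeted_select, coverage, costs, and budget are illustrative stand-ins.

# Hedged sketch: budgeted greedy selection for a monotone submodular
# objective, in the spirit of SubmodularMMRMixin / MonotoneSubmodularMixin
# above. `coverage`, `costs`, and `budget` are illustrative stand-ins,
# not names from the sumpy codebase.

def greedy_budgeted_select(candidates, coverage, costs, budget, scale=1.0):
    """Greedily pick items maximizing cost-scaled marginal gain under a budget.

    candidates: iterable of hashable items (e.g. sentence indices)
    coverage:   callable mapping a set of items to a numeric score,
                assumed monotone non-decreasing
    costs:      dict mapping item -> cost (e.g. word count)
    budget:     maximum total cost of the selection
    """
    selected = []
    spent = 0.0
    f_selected = coverage(set())
    remaining = set(candidates)
    while remaining:
        best_item, best_gain, best_score = None, 0.0, None
        for item in remaining:
            score = coverage(set(selected) | {item})
            # Cost-scaled marginal gain, analogous to (f(S+v) - f(S)) / B[v]**scale.
            gain = (score - f_selected) / (costs[item] ** scale)
            if gain > best_gain:
                best_item, best_gain, best_score = item, gain, score
        if best_item is None or spent + costs[best_item] > budget:
            break
        selected.append(best_item)
        spent += costs[best_item]
        f_selected = best_score
        remaining.discard(best_item)
    return selected


if __name__ == "__main__":
    # Toy example: each item "covers" a set of words; coverage counts distinct words.
    word_sets = {0: {"a", "b"}, 1: {"b", "c"}, 2: {"d"}, 3: {"a", "b", "c", "d"}}
    cover = lambda S: len(set().union(*(word_sets[i] for i in S))) if S else 0
    costs = {0: 2, 1: 2, 2: 1, 3: 4}
    print(greedy_budgeted_select(list(word_sets), cover, costs, budget=5))

A real annotator would derive the per-sentence costs from RawBOWMatrix row sums or raw byte lengths, as the budget_type switch in SubmodularMMRMixin above does.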
-------------------------------------------------------------------------------- 1 | import sumpy.io 2 | import sumpy.system 3 | from sumpy.simple import lede, centroid, mmr, textrank, lexrank 4 | 5 | 6 | 7 | #import numpy as np 8 | #from itertools import izip 9 | #import nltk.data 10 | #from nltk.tokenize import WordPunctTokenizer 11 | #from nltk.stem.snowball import EnglishStemmer 12 | #from nltk.corpus import stopwords 13 | #import heapq 14 | #from collections import defaultdict 15 | # 16 | #class DocumentSetReader(object): 17 | # def __init__(self, input=u"filename", preprocessor=None, sentence_processor=None, 18 | # token_processor=None, token_processor_returns=None, stop_filter=None): 19 | # 20 | # if input not in set([u"filename", u"file", u"content"]): 21 | # raise ValueError( 22 | # u"input argument must be 'filename', 'file', or 'content'") 23 | # self.input = input 24 | # 25 | # self.preprocessor = preprocessor 26 | # 27 | # if sentence_processor is None: 28 | # senttok = nltk.data.load('tokenizers/punkt/english.pickle') 29 | # sentence_processor = lambda x: senttok.tokenize(x) 30 | # self.sentence_processor = sentence_processor 31 | # 32 | # if token_processor is None: 33 | # wordtok = WordPunctTokenizer() 34 | # stemmer = EnglishStemmer() 35 | # def default_token_processor(sentence): 36 | # tokens = [[stemmer.stem(word.lower())] 37 | # for word in wordtok.tokenize(sentence)] 38 | # return tokens 39 | # token_processor = default_token_processor 40 | # token_processor_returns = ["token"] 41 | # 42 | # self.token_processor = token_processor 43 | # self.token_processor_returns = token_processor_returns 44 | # 45 | # if stop_filter is None: 46 | # stop = stopwords.words('english') 47 | # stop_filter = lambda token: token in stop or len(token) <= 2 48 | # self.stop_filter = stop_filter 49 | # 50 | # def load_documents(self, documents, names=None): 51 | # max_docs = len(documents) 52 | # if names is None: 53 | # names = ["doc{}".format(n) for n in xrange(max_docs)] 54 | # assert len(names) == len(documents) 55 | # 56 | # sentences = {} 57 | # 58 | # token_type_index = self.token_processor_returns.index(u'token') 59 | # next_sentence_id = 0 60 | # 61 | # for n_doc, (name, document) in enumerate(izip(names, documents)): 62 | # print n_doc 63 | # text = self._read(document) 64 | # for n_sent, sentence in enumerate(self.sentence_processor(text)): 65 | # tokens = {tok_type: list() 66 | # for tok_type in self.token_processor_returns} 67 | # 68 | # for token_types in self.token_processor(sentence): 69 | # if self.stop_filter(token_types[token_type_index]): 70 | # continue 71 | # for tok_type, token in izip( 72 | # self.token_processor_returns, token_types): 73 | # tokens[tok_type].append(token) 74 | # if len(tokens[u'token']) == 0: 75 | # continue 76 | # 77 | # sentences[next_sentence_id] = {u"name": name, 78 | # u"n_doc": n_doc, 79 | # u"n_sent": n_sent, 80 | # u"tokens": tokens, 81 | # u"sentence": sentence} 82 | # next_sentence_id += 1 83 | # return sentences 84 | # 85 | # def _read(self, document): 86 | # 87 | # if self.input == u"filename": 88 | # with open(document, u"r") as f: 89 | # text = ''.join(f.readlines()) 90 | # elif self.input == u"file": 91 | # text = ''.join(document.readlines()) 92 | # elif self.input == u"content": 93 | # text = document 94 | # 95 | # if isinstance(text, str): 96 | # text = text.decode(u"utf-8") 97 | # 98 | # if self.preprocessor is not None: 99 | # text = self.preprocessor(text) 100 | # 101 | # return text 102 | # 103 | #class SentenceRanker(object): 
104 | # pass 105 | # 106 | #class SumBasicRanker(SentenceRanker): 107 | # 108 | # def rank(self, summary_input): 109 | # print "RANKING" 110 | # ordered = [] 111 | # unigram_probs = self._build_unigram_probs(summary_input) 112 | # 113 | # heap = [(1-prob, word) for word, prob in unigram_probs.items()] 114 | # heapq.heapify(heap) 115 | # 116 | # weights = [] 117 | # token2sentids = defaultdict(list) 118 | # 119 | # covered = set() 120 | # n_sents = len(summary_input) 121 | # 122 | # print "Debug" 123 | # for sent_id in sorted(summary_input.keys()): 124 | # weight = 0 125 | # length = 0 126 | # print sent_id 127 | # for token in summary_input[sent_id][u'tokens'][u'token']: 128 | # weight += unigram_probs[token] 129 | # token2sentids[token].append(sent_id) 130 | # length += 1 131 | # print u"{}/{}".format(token, weight), 132 | # print 133 | # weight /= float(length) 134 | # print weight 135 | # weights.append(weight) 136 | # 137 | # while len(ordered) != n_sents: 138 | # # Get highest prob word (1) 139 | # prob, word = heapq.heappop(heap) 140 | # 141 | # # Get highest scored sentence containing highest prob word 142 | # sent_ids = token2sentids[word] 143 | # sent_ids.sort(key=lambda x: weights[x]) 144 | # 145 | # for sent_id in sent_ids: 146 | # print sent_id, weights[sent_id] 147 | # print summary_input[sent_id][u'sentence'] 148 | # break 149 | # 150 | # sent_id = sent_ids.pop() 151 | # while sent_id in covered: 152 | # if len(sent_ids) == 0: 153 | # break 154 | # sent_id = sent_ids.pop() 155 | # 156 | # if len(sent_ids) == 0: 157 | # continue 158 | # 159 | # ordered.append(sent_id) 160 | # covered.add(sent_id) 161 | # 162 | # # for sent_id in sent_ids: 163 | # # weights[sent_id] = (1 - prob) 164 | # heapq.heappush(heap, (1 - (1 - prob)**2, word)) 165 | # print word, weights 166 | # print summary_input[sent_id][u'sentence'] 167 | # #for sent_id in sent_ids: 168 | # # print sent_id, weights[sent_id] 169 | # #for prob, word in heapq.heappop(heap) 170 | # #def 171 | # 172 | # def _build_unigram_probs(self, summary_input): 173 | # probs = {} 174 | # total = 0 175 | # for sentence in summary_input.values(): 176 | # for token in sentence[u'tokens'][u'token']: 177 | # probs[token] = probs.get(token, 0) + 1 178 | # total += 1 179 | # 180 | # assert total > 1 181 | # total = float(total) 182 | # for key in probs.keys(): 183 | # probs[key] /= total 184 | # return probs 185 | # 186 | #class PageRank(object): 187 | # 188 | # def __init__(self, max_iters=100, tol=1E-4, d=.85): 189 | # self.max_iters = max_iters 190 | # self.tol = tol 191 | # self.d = d 192 | # 193 | # def rank(self, K): 194 | # n_nodes = K.shape[0] 195 | # r = np.ones((n_nodes, 1), dtype=np.float64) / n_nodes 196 | # #r /= np.sum(r) 197 | # last_r = np.ones((n_nodes, 1)) 198 | # K_hat = (self.d * K) + \ 199 | # (float(1 - self.d) / n_nodes) * np.ones((n_nodes, n_nodes)) 200 | # 201 | # converged = False 202 | # for n_iter in xrange(self.max_iters): 203 | # last_r = r 204 | # r = np.dot(K_hat, r) 205 | # r /= np.sum(r) 206 | # 207 | # if (np.abs(r - last_r) < self.tol).any(): 208 | # converged = True 209 | # break 210 | # 211 | # if not converged: 212 | # print "Warning: PageRank not converged after %d iters" % self.max_iters 213 | # 214 | # return r 215 | # 216 | # 217 | # 218 | #class LexRank(object): 219 | # pass 220 | # 221 | #class TextRank(object): 222 | # 223 | # def summarize(self, text_units): 224 | # pass 225 | # 226 | # def sentence_tokenizer(self): 227 | # pass 228 | # def word_tokenizer(self): 229 | # pass 230 | 
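The commented-out PageRank class above, and the TextRankMixin and LexRankMixin annotators that appear later in this dump, all score sentences with the same damped power iteration over a sentence-similarity matrix: mix the matrix with a uniform teleport term weighted by (1 - d), column-normalize, and iterate r = M_hat . r until the scores stop moving. Below is a small self-contained NumPy sketch of that iteration; it is illustrative only (the function name and the random similarity matrix are not part of sumpy), and it stops once every component of the score vector moves by less than tol.

# Hedged sketch of the damped power iteration used by the PageRank helper
# above and by the TextRank/LexRank mixins; not sumpy API, just the math.
import numpy as np

def damped_power_iteration(K, d=0.85, max_iters=100, tol=1e-4):
    """Return a stationary score vector for a non-negative similarity matrix K (n x n)."""
    n = K.shape[0]
    # Mix K with a uniform "teleport" term and column-normalize so the
    # transition matrix is stochastic.
    M_hat = d * K + ((1.0 - d) / n) * np.ones((n, n))
    M_hat /= M_hat.sum(axis=0)
    r = np.ones(n) / n
    for _ in range(max_iters):
        last_r = r
        r = M_hat.dot(r)
        r /= r.sum()
        # Stop once every component has moved less than tol.
        if np.all(np.abs(r - last_r) < tol):
            break
    return r

# Toy usage on a random symmetric similarity matrix.
K = np.abs(np.random.rand(5, 5))
K = (K + K.T) / 2.0
print(damped_power_iteration(K))

In the mixins, the resulting score vector is written into the sentence DataFrame as the f:textrank or f:lexrank feature column.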
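Because sumpy/__init__.py above re-exports the one-call wrappers from sumpy.simple, the whole annotator pipeline can be driven with a single function call per algorithm. A minimal usage sketch follows, assuming the inputs are a plain list of document strings (the list/tuple path handled by the system base class earlier in this dump); the example documents are invented.

# Minimal usage sketch of the convenience wrappers re-exported by
# sumpy/__init__.py. The documents are made up for illustration.
import sumpy

docs = [
    "The quick brown fox jumped over the lazy dog. The dog kept sleeping.",
    "A fox was seen jumping over a sleeping dog near the river bank.",
]

# Each wrapper builds the corresponding summarizer and returns its Summary object.
for summarize in (sumpy.lede, sumpy.centroid, sumpy.textrank, sumpy.lexrank, sumpy.mmr):
    print(summarize(docs))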
-------------------------------------------------------------------------------- /sumpy/annotators/_feature_extractors.py: -------------------------------------------------------------------------------- 1 | from sumpy.annotators import (SentenceTokenizerMixin, BinaryBOWMixin, 2 | TfIdfMixin, TfIdfCosineSimilarityMixin) 3 | import numpy as np 4 | from itertools import combinations 5 | 6 | class LedeMixin(SentenceTokenizerMixin): 7 | 8 | def build(self): 9 | pass 10 | 11 | def process(self, input_df, ndarray_data): 12 | input_df[u"f:lede"] = 0 13 | for doc_id, group in input_df.groupby("doc id"): 14 | idx = group["sent id"].argmin() 15 | input_df.ix[idx, u"f:lede"] = 1 16 | return input_df, ndarray_data 17 | 18 | def requires(self): 19 | return ["sent id",] 20 | 21 | def ndarray_requires(self): 22 | return [] 23 | 24 | def returns(self): 25 | return ["f:lede"] 26 | 27 | def ndarray_returns(self): 28 | return [] 29 | 30 | def name(self): 31 | return "LedeMixin" 32 | 33 | class TextRankMixin(BinaryBOWMixin): 34 | 35 | def build(self): 36 | if not hasattr(self, "directed"): 37 | self.directed = u"undirected" 38 | assert self.directed in ["undirected",] # [u"directed", "undirected"] 39 | # TODO actually implement directed 40 | 41 | if not hasattr(self, "d"): 42 | self.d = .85 43 | assert 0 < self.d and self.d < 1 44 | 45 | if not hasattr(self, "max_iters"): 46 | self.max_iters = 20 47 | assert isinstance(self.max_iters, int) and self.max_iters > 0 48 | 49 | if not hasattr(self, "tol"): 50 | self.tol = .0001 51 | assert 0 < self.tol 52 | 53 | def textrank(input_df, ndarray_data): 54 | max_sents = input_df.shape[0] 55 | l = input_df["words"].apply(len).tolist() 56 | K = self._textrank_kernel( 57 | l, ndarray_data["BinaryBOWMatrix"], directed=self.directed) 58 | M_hat = (self.d * K) + \ 59 | (float(1 - self.d) / max_sents) * np.ones( 60 | (max_sents, max_sents)) 61 | M_hat /= np.sum(M_hat, axis=0) 62 | r = np.ones((max_sents), dtype=np.float64) / max_sents 63 | 64 | converged = False 65 | for n_iter in xrange(self.max_iters): 66 | last_r = r 67 | r = np.dot(M_hat, r) 68 | 69 | if (np.abs(r - last_r) < self.tol).any(): 70 | converged = True 71 | break 72 | 73 | if not converged: 74 | print "warning:", 75 | print "textrank failed to converged after {} iters".format( 76 | self.max_iters) 77 | input_df["f:textrank"] = r 78 | return input_df, ndarray_data 79 | self._textrank = textrank 80 | 81 | def process(self, input_df, ndarray_data): 82 | return self._textrank(input_df, ndarray_data) 83 | 84 | def _textrank_kernel(self, l, X, directed=u"undirected"): 85 | """Compute similarity matrix K ala text rank paper. 
Should this be 86 | a ufunc???""" 87 | #X = X.todense() 88 | #X[X > 0] = 1 89 | N = X.dot(X.T) 90 | 91 | n_sents = X.shape[0] 92 | M = np.zeros((n_sents, n_sents), dtype=np.float64) 93 | for i, j in combinations(xrange(n_sents), 2): 94 | #s_i = word_sets[i] 95 | #s_j = word_sets[j] 96 | val = N[i,j] #len(s_i.intersection(s_j)) 97 | val /= np.log(l[i] * l[j]) 98 | M[i,j] = val 99 | M[j,i] = val 100 | return M 101 | 102 | def requires(self): 103 | return ["words",] 104 | 105 | def ndarray_requires(self): 106 | return ["BinaryBOWMatrix",] 107 | 108 | def returns(self): 109 | return ["f:textrank"] 110 | 111 | def ndarray_returns(self): 112 | return [] 113 | 114 | def name(self): 115 | return "TextRankMixin" 116 | 117 | class LexRankMixin(TfIdfCosineSimilarityMixin): 118 | 119 | def build(self): 120 | if not hasattr(self, "d"): 121 | self.d = .85 122 | assert 0 < self.d and self.d < 1 123 | 124 | if not hasattr(self, "max_iters"): 125 | self.max_iters = 20 126 | assert isinstance(self.max_iters, int) and self.max_iters > 0 127 | 128 | if not hasattr(self, "tol"): 129 | self.tol = .0001 130 | assert 0 < self.tol 131 | 132 | def lexrank(input_df, ndarray_data): 133 | max_sents = input_df.shape[0] 134 | #l = input_df["words"].apply(len).tolist() 135 | K = ndarray_data["TfIdfCosSimMatrix"] 136 | M_hat = (self.d * K) + \ 137 | (float(1 - self.d) / max_sents) * np.ones( 138 | (max_sents, max_sents)) 139 | M_hat /= np.sum(M_hat, axis=0) 140 | r = np.ones((max_sents), dtype=np.float64) / max_sents 141 | 142 | converged = False 143 | for n_iter in xrange(self.max_iters): 144 | last_r = r 145 | r = np.dot(M_hat, r) 146 | 147 | if (np.abs(r - last_r) < self.tol).any(): 148 | converged = True 149 | break 150 | 151 | if not converged: 152 | print "warning:", 153 | print "lexrank failed to converged after {} iters".format( 154 | self.max_iters) 155 | input_df["f:lexrank"] = r 156 | return input_df, ndarray_data 157 | self._lexrank = lexrank 158 | 159 | def process(self, input_df, ndarray_data): 160 | return self._lexrank(input_df, ndarray_data) 161 | 162 | def requires(self): 163 | return [] 164 | 165 | def ndarray_requires(self): 166 | return ["TfIdfCosSimMatrix",] 167 | 168 | def returns(self): 169 | return ["f:lexrank"] 170 | 171 | def ndarray_returns(self): 172 | return [] 173 | 174 | def name(self): 175 | return "LexRankMixin" 176 | 177 | class CentroidMixin(TfIdfMixin, BinaryBOWMixin): 178 | 179 | def build(self): 180 | pass 181 | 182 | def process(self, input_df, ndarray_data): 183 | B = ndarray_data["BinaryBOWMatrix"] 184 | X = ndarray_data["TfIdfMatrix"] 185 | c = X.sum(axis=0) 186 | assert c.shape[1] == X.shape[1] 187 | input_df["f:centroid"] = B.dot(c.T) 188 | return input_df, ndarray_data 189 | 190 | def requires(self): 191 | return [] 192 | 193 | def ndarray_requires(self): 194 | return ["TfIdfMatrix", "BinaryBOWMatrix"] 195 | 196 | def returns(self): 197 | return ["f:centroid"] 198 | 199 | def ndarray_returns(self): 200 | return [] 201 | 202 | def name(self): 203 | return "CentroidMixin" 204 | 205 | class MMRMixin(TfIdfCosineSimilarityMixin): 206 | 207 | def build(self): 208 | if not hasattr(self, "lam"): 209 | self.lam = .7 210 | assert 0 < self.lam and self.lam < 1 211 | 212 | def rank(input_df, ndarray_data): 213 | K = ndarray_data["TfIdfCosSimMatrix"] 214 | K = np.ma.masked_array(K, mask=np.diag(np.diag(K))) 215 | K_input = np.ma.masked_array( 216 | K, mask=False, fill_value=0, hardmask=False) 217 | K_summ = np.ma.masked_array( 218 | K, mask=True, fill_value=0, hardmask=False) 219 | 220 | 
w1 = self.lam 221 | w2 = (1 - w1) 222 | for rank in range(K.shape[0], 0, -1): 223 | if rank == K.shape[0]: 224 | K_input_max = K_input.max(axis=1).filled(float("-inf")) 225 | idx = np.argmax(K_input_max) 226 | else: 227 | K_summ_max = K_summ.max(axis=1).filled(0) 228 | K_input_max = K_input.max(axis=1).filled(float("inf")) 229 | 230 | S = w1 * K_summ_max - w2 * K_input_max 231 | idx = np.argmax(S) 232 | 233 | K_summ.mask[:,idx] = False 234 | K_summ.mask[idx, idx] = True 235 | K_input.mask[idx,:] = True 236 | 237 | input_df.ix[idx, "f:mmr"] = rank 238 | 239 | return input_df, ndarray_data 240 | self._mmr = rank 241 | 242 | def process(self, input_df, ndarray_data): 243 | return self._mmr(input_df, ndarray_data) 244 | 245 | def requires(self): 246 | return [] 247 | 248 | def ndarray_requires(self): 249 | return ["TfIdfCosSimMatrix"] 250 | 251 | def returns(self): 252 | return ["f:mmr"] 253 | 254 | def ndarray_returns(self): 255 | return [] 256 | 257 | def name(self): 258 | return "MMRMixin" 259 | -------------------------------------------------------------------------------- /sumpy/data/duc07_task2.json: -------------------------------------------------------------------------------- 1 | { 2 | "D0703A": { 3 | "A": { 4 | "inputs": [ 5 | ], 6 | "models": [ 7 | "D0703-A.M.100.A.A", 8 | "D0703-A.M.100.A.C", 9 | "D0703-A.M.100.A.D", 10 | "D0703-A.M.100.A.J", 11 | ], 12 | }, 13 | "B": { 14 | "inputs": [ 15 | ], 16 | "models": [ 17 | "D0703-B.M.100.A.A", 18 | "D0703-B.M.100.A.C", 19 | "D0703-B.M.100.A.D", 20 | "D0703-B.M.100.A.J", 21 | ], 22 | }, 23 | "C": { 24 | "inputs": [ 25 | ], 26 | "models": [ 27 | "D0703-C.M.100.A.A", 28 | "D0703-C.M.100.A.C", 29 | "D0703-C.M.100.A.D", 30 | "D0703-C.M.100.A.J", 31 | ], 32 | }, 33 | }, 34 | "D0706B": { 35 | "A": { 36 | "inputs": [ 37 | ], 38 | "models": [ 39 | "D0706-A.M.100.B.B", 40 | "D0706-A.M.100.B.D", 41 | "D0706-A.M.100.B.E", 42 | "D0706-A.M.100.B.I", 43 | ], 44 | }, 45 | "B": { 46 | "inputs": [ 47 | ], 48 | "models": [ 49 | "D0706-B.M.100.B.B", 50 | "D0706-B.M.100.B.D", 51 | "D0706-B.M.100.B.E", 52 | "D0706-B.M.100.B.I", 53 | ], 54 | }, 55 | "C": { 56 | "inputs": [ 57 | ], 58 | "models": [ 59 | "D0706-C.M.100.B.B", 60 | "D0706-C.M.100.B.D", 61 | "D0706-C.M.100.B.E", 62 | "D0706-C.M.100.B.I", 63 | ], 64 | }, 65 | }, 66 | "D0711C": { 67 | "A": { 68 | "inputs": [ 69 | ], 70 | "models": [ 71 | "D0711-A.M.100.C.A", 72 | "D0711-A.M.100.C.B", 73 | "D0711-A.M.100.C.C", 74 | "D0711-A.M.100.C.F", 75 | ], 76 | }, 77 | "B": { 78 | "inputs": [ 79 | ], 80 | "models": [ 81 | "D0711-B.M.100.C.A", 82 | "D0711-B.M.100.C.B", 83 | "D0711-B.M.100.C.C", 84 | "D0711-B.M.100.C.F", 85 | ], 86 | }, 87 | "C": { 88 | "inputs": [ 89 | ], 90 | "models": [ 91 | "D0711-C.M.100.C.A", 92 | "D0711-C.M.100.C.B", 93 | "D0711-C.M.100.C.C", 94 | "D0711-C.M.100.C.F", 95 | ], 96 | }, 97 | }, 98 | "D0716D": { 99 | "A": { 100 | "inputs": [ 101 | ], 102 | "models": [ 103 | "D0716-A.M.100.D.C", 104 | "D0716-A.M.100.D.D", 105 | "D0716-A.M.100.D.E", 106 | "D0716-A.M.100.D.F", 107 | ], 108 | }, 109 | "B": { 110 | "inputs": [ 111 | ], 112 | "models": [ 113 | "D0716-B.M.100.D.C", 114 | "D0716-B.M.100.D.D", 115 | "D0716-B.M.100.D.E", 116 | "D0716-B.M.100.D.F", 117 | ], 118 | }, 119 | "C": { 120 | "inputs": [ 121 | ], 122 | "models": [ 123 | "D0716-C.M.100.D.C", 124 | "D0716-C.M.100.D.D", 125 | "D0716-C.M.100.D.E", 126 | "D0716-C.M.100.D.F", 127 | ], 128 | }, 129 | }, 130 | "D0721E": { 131 | "A": { 132 | "inputs": [ 133 | ], 134 | "models": [ 135 | "D0721-A.M.100.E.B", 136 | "D0721-A.M.100.E.C", 137 | 
"D0721-A.M.100.E.E", 138 | "D0721-A.M.100.E.G", 139 | ], 140 | }, 141 | "B": { 142 | "inputs": [ 143 | ], 144 | "models": [ 145 | "D0721-B.M.100.E.B", 146 | "D0721-B.M.100.E.C", 147 | "D0721-B.M.100.E.E", 148 | "D0721-B.M.100.E.G", 149 | ], 150 | }, 151 | "C": { 152 | "inputs": [ 153 | ], 154 | "models": [ 155 | "D0721-C.M.100.E.B", 156 | "D0721-C.M.100.E.C", 157 | "D0721-C.M.100.E.E", 158 | "D0721-C.M.100.E.G", 159 | ], 160 | }, 161 | }, 162 | "D0726F": { 163 | "A": { 164 | "inputs": [ 165 | ], 166 | "models": [ 167 | "D0726-A.M.100.F.A", 168 | "D0726-A.M.100.F.E", 169 | "D0726-A.M.100.F.F", 170 | "D0726-A.M.100.F.G", 171 | ], 172 | }, 173 | "B": { 174 | "inputs": [ 175 | ], 176 | "models": [ 177 | "D0726-B.M.100.F.A", 178 | "D0726-B.M.100.F.E", 179 | "D0726-B.M.100.F.F", 180 | "D0726-B.M.100.F.G", 181 | ], 182 | }, 183 | "C": { 184 | "inputs": [ 185 | ], 186 | "models": [ 187 | "D0726-C.M.100.F.A", 188 | "D0726-C.M.100.F.E", 189 | "D0726-C.M.100.F.F", 190 | "D0726-C.M.100.F.G", 191 | ], 192 | }, 193 | }, 194 | "D0727G": { 195 | "A": { 196 | "inputs": [ 197 | ], 198 | "models": [ 199 | "D0727-A.M.100.G.A", 200 | "D0727-A.M.100.G.F", 201 | "D0727-A.M.100.G.G", 202 | "D0727-A.M.100.G.H", 203 | ], 204 | }, 205 | "B": { 206 | "inputs": [ 207 | ], 208 | "models": [ 209 | "D0727-B.M.100.G.A", 210 | "D0727-B.M.100.G.F", 211 | "D0727-B.M.100.G.G", 212 | "D0727-B.M.100.G.H", 213 | ], 214 | }, 215 | "C": { 216 | "inputs": [ 217 | ], 218 | "models": [ 219 | "D0727-C.M.100.G.A", 220 | "D0727-C.M.100.G.F", 221 | "D0727-C.M.100.G.G", 222 | "D0727-C.M.100.G.H", 223 | ], 224 | }, 225 | }, 226 | "D0736H": { 227 | "A": { 228 | "inputs": [ 229 | ], 230 | "models": [ 231 | "D0736-A.M.100.H.G", 232 | "D0736-A.M.100.H.H", 233 | "D0736-A.M.100.H.I", 234 | "D0736-A.M.100.H.J", 235 | ], 236 | }, 237 | "B": { 238 | "inputs": [ 239 | ], 240 | "models": [ 241 | "D0736-B.M.100.H.G", 242 | "D0736-B.M.100.H.H", 243 | "D0736-B.M.100.H.I", 244 | "D0736-B.M.100.H.J", 245 | ], 246 | }, 247 | "C": { 248 | "inputs": [ 249 | ], 250 | "models": [ 251 | "D0736-C.M.100.H.G", 252 | "D0736-C.M.100.H.H", 253 | "D0736-C.M.100.H.I", 254 | "D0736-C.M.100.H.J", 255 | ], 256 | }, 257 | }, 258 | "D0740I": { 259 | "A": { 260 | "inputs": [ 261 | ], 262 | "models": [ 263 | "D0740-A.M.100.I.D", 264 | "D0740-A.M.100.I.H", 265 | "D0740-A.M.100.I.I", 266 | "D0740-A.M.100.I.J", 267 | ], 268 | }, 269 | "B": { 270 | "inputs": [ 271 | ], 272 | "models": [ 273 | "D0740-B.M.100.I.D", 274 | "D0740-B.M.100.I.H", 275 | "D0740-B.M.100.I.I", 276 | "D0740-B.M.100.I.J", 277 | ], 278 | }, 279 | "C": { 280 | "inputs": [ 281 | ], 282 | "models": [ 283 | "D0740-C.M.100.I.D", 284 | "D0740-C.M.100.I.H", 285 | "D0740-C.M.100.I.I", 286 | "D0740-C.M.100.I.J", 287 | ], 288 | }, 289 | }, 290 | "D0743J": { 291 | "A": { 292 | "inputs": [ 293 | ], 294 | "models": [ 295 | "D0743-A.M.100.J.B", 296 | "D0743-A.M.100.J.H", 297 | "D0743-A.M.100.J.I", 298 | "D0743-A.M.100.J.J", 299 | ], 300 | }, 301 | "B": { 302 | "inputs": [ 303 | ], 304 | "models": [ 305 | "D0743-B.M.100.J.B", 306 | "D0743-B.M.100.J.H", 307 | "D0743-B.M.100.J.I", 308 | "D0743-B.M.100.J.J", 309 | ], 310 | }, 311 | "C": { 312 | "inputs": [ 313 | ], 314 | "models": [ 315 | "D0743-C.M.100.J.B", 316 | "D0743-C.M.100.J.H", 317 | "D0743-C.M.100.J.I", 318 | "D0743-C.M.100.J.J", 319 | ], 320 | }, 321 | }, 322 | } 323 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | 
Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /sumpy/data/duc03_task2.json: -------------------------------------------------------------------------------- 1 | { 2 | "D30003": { 3 | "inputs": [ 4 | "APW19981017.0151", 5 | "NYT19981017.0177", 6 | "APW19981017.0306", 7 | "APW19981017.0477", 8 | "APW19981017.0507", 9 | "NYT19981018.0098", 10 | "NYT19981018.0123", 11 | "NYT19981018.0160", 12 | "NYT19981018.0185", 13 | "APW19981018.0410" 14 | ], 15 | "models": [ 16 | "D30003.M.100.T.B", 17 | "D30003.M.100.T.C", 18 | "D30003.M.100.T.D", 19 | "D30003.M.100.T.E" 20 | ] 21 | }, 22 | "D30005": { 23 | "inputs": [ 24 | "NYT19981007.0383", 25 | "NYT19981007.0384", 26 | "NYT19981016.0283", 27 | "NYT19981017.0140", 28 | "NYT19981021.0378", 29 | "NYT19981022.0453", 30 | "NYT19981104.0491", 31 | "APW19981104.0772", 32 | "APW19981105.0282", 33 | "NYT19981110.0432" 34 | ], 35 | "models": [ 36 | "D30005.M.100.T.G", 37 | "D30005.M.100.T.H", 38 | "D30005.M.100.T.I", 39 | "D30005.M.100.T.J" 40 | ] 41 | }, 42 | "D30010": { 43 | "inputs": [ 44 | "NYT19981106.0468", 45 | "NYT19981106.0494", 46 | "APW19981106.0520", 47 | "APW19981106.0572", 48 | "APW19981106.0851", 49 | "APW19981106.1126", 50 | "APW19981107.0116", 51 | "APW19981107.0118", 52 | "APW19981107.0131", 53 | "APW19981107.0143" 54 | ], 55 | "models": [ 56 | "D30010.M.100.T.F", 57 | "D30010.M.100.T.G", 58 | "D30010.M.100.T.H", 59 | "D30010.M.100.T.I" 60 | ] 61 | }, 62 | "D30012": { 63 | "inputs": [ 64 | "NYT19981012.0242", 65 | "NYT19981012.0245", 66 | "NYT19981115.0065", 67 | "NYT19981115.0091", 68 | "NYT19981116.0199", 69 | "NYT19981117.0290", 70 | "APW19981117.0597", 71 | "APW19981117.0914", 72 | "APW19981117.1226", 73 | "NYT19981118.0287" 74 | ], 75 | "models": [ 76 | "D30012.M.100.T.A", 77 | "D30012.M.100.T.B", 78 | "D30012.M.100.T.C", 79 | "D30012.M.100.T.D" 80 | ] 81 | }, 82 | "D30016": { 83 | "inputs": [ 84 | "NYT19981001.0440", 85 | "APW19981006.0543", 86 | "APW19981027.1075", 87 | "APW19981027.1082", 88 | "NYT19981029.0472", 89 | "APW19981105.0853", 90 | "NYT19981112.0477", 91 | "NYT19981118.0185", 92 | "APW19981122.0381", 93 | "APW19981122.0382" 94 | ], 95 | "models": [ 96 | "D30016.M.100.T.A", 97 | "D30016.M.100.T.B", 98 | "D30016.M.100.T.C", 99 | "D30016.M.100.T.J" 100 | ] 101 | }, 102 | "D30020": { 103 | "inputs": [ 104 | "APW19981103.0271", 105 | 
"APW19981116.0496", 106 | "APW19981119.0262", 107 | "APW19981125.0256", 108 | "APW19981202.0568", 109 | "APW19981204.0252", 110 | "APW19981206.0169", 111 | "APW19981206.0201", 112 | "APW19981206.0390", 113 | "APW19981206.0393" 114 | ], 115 | "models": [ 116 | "D30020.M.100.T.E", 117 | "D30020.M.100.T.F", 118 | "D30020.M.100.T.G", 119 | "D30020.M.100.T.H" 120 | ] 121 | }, 122 | "D30025": { 123 | "inputs": [ 124 | "NYT19981004.0131", 125 | "NYT19981004.0152", 126 | "NYT19981005.0331", 127 | "APW19981005.1097", 128 | "NYT19981006.0468", 129 | "APW19981006.1099", 130 | "NYT19981025.0180", 131 | "APW19981025.0395", 132 | "NYT19981110.0322", 133 | "APW19981231.0565" 134 | ], 135 | "models": [ 136 | "D30025.M.100.T.D", 137 | "D30025.M.100.T.E", 138 | "D30025.M.100.T.F", 139 | "D30025.M.100.T.G" 140 | ] 141 | }, 142 | "D30028": { 143 | "inputs": [ 144 | "NYT19981003.0058", 145 | "APW19981003.0487", 146 | "APW19981003.0645", 147 | "APW19981003.0646", 148 | "APW19981003.0741", 149 | "NYT19981004.0069", 150 | "APW19981004.0146", 151 | "APW19981004.0172", 152 | "APW19981004.0175", 153 | "APW19981004.0180" 154 | ], 155 | "models": [ 156 | "D30028.M.100.T.C", 157 | "D30028.M.100.T.D", 158 | "D30028.M.100.T.E", 159 | "D30028.M.100.T.F" 160 | ] 161 | }, 162 | "D30034": { 163 | "inputs": [ 164 | "NYT19981029.0389", 165 | "APW19981101.0202", 166 | "APW19981104.0812", 167 | "APW19981106.0542", 168 | "APW19981106.0551", 169 | "APW19981111.0598", 170 | "APW19981112.0549", 171 | "APW19981115.0219", 172 | "APW19981120.0282", 173 | "NYT19981124.0353" 174 | ], 175 | "models": [ 176 | "D30034.M.100.T.A", 177 | "D30034.M.100.T.B", 178 | "D30034.M.100.T.I", 179 | "D30034.M.100.T.J" 180 | ] 181 | }, 182 | "D30040": { 183 | "inputs": [ 184 | "APW19981124.0254", 185 | "APW19981124.0256", 186 | "NYT19981124.0267", 187 | "APW19981205.0220", 188 | "NYT19981206.0110", 189 | "NYT19981206.0144", 190 | "APW19981229.0756", 191 | "APW19981229.0763", 192 | "APW19981230.0983", 193 | "APW19981230.0991" 194 | ], 195 | "models": [ 196 | "D30040.M.100.T.B", 197 | "D30040.M.100.T.C", 198 | "D30040.M.100.T.D", 199 | "D30040.M.100.T.E" 200 | ] 201 | }, 202 | "D30042": { 203 | "inputs": [ 204 | "APW19981020.1108", 205 | "NYT19981021.0303", 206 | "APW19981028.0445", 207 | "NYT19981031.0088", 208 | "APW19981123.1112", 209 | "APW19981123.1153", 210 | "APW19981125.0279", 211 | "APW19981125.0886", 212 | "APW19981125.0903", 213 | "APW19981129.0652" 214 | ], 215 | "models": [ 216 | "D30042.M.100.T.A", 217 | "D30042.M.100.T.B", 218 | "D30042.M.100.T.C", 219 | "D30042.M.100.T.D" 220 | ] 221 | }, 222 | "D30044": { 223 | "inputs": [ 224 | "NYT19981021.0318", 225 | "APW19981021.0554", 226 | "APW19981104.0265", 227 | "APW19981104.0525", 228 | "APW19981106.1119", 229 | "APW19981110.0230", 230 | "APW19981113.0541", 231 | "APW19981113.0895", 232 | "APW19981113.0896", 233 | "APW19981114.0178" 234 | ], 235 | "models": [ 236 | "D30044.M.100.T.A", 237 | "D30044.M.100.T.H", 238 | "D30044.M.100.T.I", 239 | "D30044.M.100.T.J" 240 | ] 241 | }, 242 | "D30048": { 243 | "inputs": [ 244 | "NYT19981001.0351", 245 | "NYT19981003.0074", 246 | "APW19981003.0705", 247 | "NYT19981004.0131", 248 | "NYT19981004.0152", 249 | "NYT19981005.0331", 250 | "NYT19981005.0365", 251 | "NYT19981006.0468", 252 | "APW19981014.0523", 253 | "NYT19981016.0286" 254 | ], 255 | "models": [ 256 | "D30048.M.100.T.G", 257 | "D30048.M.100.T.H", 258 | "D30048.M.100.T.I", 259 | "D30048.M.100.T.J" 260 | ] 261 | }, 262 | "D30050": { 263 | "inputs": [ 264 | "NYT19981003.0061", 265 | 
"NYT19981004.0121", 266 | "NYT19981004.0125", 267 | "NYT19981005.0379", 268 | "NYT19981005.0445", 269 | "NYT19981007.0352", 270 | "NYT19981007.0353", 271 | "NYT19981007.0355", 272 | "NYT19981007.0395", 273 | "NYT19981008.0467" 274 | ], 275 | "models": [ 276 | "D30050.M.100.T.A", 277 | "D30050.M.100.T.B", 278 | "D30050.M.100.T.C", 279 | "D30050.M.100.T.J" 280 | ] 281 | }, 282 | "D30051": { 283 | "inputs": [ 284 | "NYT19981001.0377", 285 | "APW19981012.0791", 286 | "APW19981014.0564", 287 | "APW19981016.0667", 288 | "APW19981021.0246", 289 | "APW19981029.0281", 290 | "APW19981104.0524", 291 | "APW19981104.0537", 292 | "APW19981105.0609", 293 | "APW19981107.0700" 294 | ], 295 | "models": [ 296 | "D30051.M.100.T.F", 297 | "D30051.M.100.T.G", 298 | "D30051.M.100.T.H", 299 | "D30051.M.100.T.I" 300 | ] 301 | }, 302 | "D30056": { 303 | "inputs": [ 304 | "APW19981004.0717", 305 | "APW19981005.0718", 306 | "APW19981014.0284", 307 | "APW19981026.0225", 308 | "APW19981208.0286", 309 | "NYT19981208.0294", 310 | "APW19981208.0313", 311 | "APW19981208.0315", 312 | "APW19981208.0876" 313 | ], 314 | "models": [ 315 | "D30056.M.100.T.E", 316 | "D30056.M.100.T.F", 317 | "D30056.M.100.T.G", 318 | "D30056.M.100.T.H" 319 | ] 320 | }, 321 | "D31001": { 322 | "inputs": [ 323 | "APW19981008.0841", 324 | "APW19981026.0485", 325 | "APW19981026.0787", 326 | "APW19981028.0231", 327 | "NYT19981028.0331", 328 | "NYT19981029.0366", 329 | "NYT19981031.0150", 330 | "APW19981031.0742", 331 | "NYT19981107.0056", 332 | "NYT19981107.0057" 333 | ], 334 | "models": [ 335 | "D31001.M.100.T.D", 336 | "D31001.M.100.T.E", 337 | "D31001.M.100.T.F", 338 | "D31001.M.100.T.G" 339 | ] 340 | }, 341 | "D31002": { 342 | "inputs": [ 343 | "NYT19981003.0082", 344 | "APW19981003.0170", 345 | "APW19981003.0180", 346 | "NYT19981003.0187", 347 | "APW19981003.0470", 348 | "APW19981003.0473", 349 | "APW19981003.0492", 350 | "NYT19981004.0056", 351 | "APW19981004.0165", 352 | "APW19981004.0171" 353 | ], 354 | "models": [ 355 | "D31002.M.100.T.A", 356 | "D31002.M.100.T.B", 357 | "D31002.M.100.T.I", 358 | "D31002.M.100.T.J" 359 | ] 360 | }, 361 | "D31009": { 362 | "inputs": [ 363 | "NYT19981125.0347", 364 | "APW19981125.0544", 365 | "APW19981125.0898", 366 | "APW19981126.0707", 367 | "APW19981126.0971", 368 | "APW19981126.1022", 369 | "APW19981129.0435", 370 | "APW19981129.0625", 371 | "APW19981130.0508", 372 | "APW19981202.0281" 373 | ], 374 | "models": [ 375 | "D31009.M.100.T.A", 376 | "D31009.M.100.T.H", 377 | "D31009.M.100.T.I", 378 | "D31009.M.100.T.J" 379 | ] 380 | }, 381 | "D31010": { 382 | "inputs": [ 383 | "APW19981123.0259", 384 | "APW19981123.0532", 385 | "APW19981123.1118", 386 | "APW19981125.0278", 387 | "NYT19981125.0289", 388 | "APW19981130.0222", 389 | "APW19981203.0965", 390 | "APW19981203.0970", 391 | "APW19981205.0353", 392 | "APW19981205.0560" 393 | ], 394 | "models": [ 395 | "D31010.M.100.T.C", 396 | "D31010.M.100.T.D", 397 | "D31010.M.100.T.E", 398 | "D31010.M.100.T.F" 399 | ] 400 | }, 401 | "D31011": { 402 | "inputs": [ 403 | "NYT19981022.0367", 404 | "APW19981023.0551", 405 | "APW19981024.0164", 406 | "APW19981024.0343", 407 | "APW19981025.0231", 408 | "APW19981025.0412", 409 | "APW19981025.0922", 410 | "APW19981029.0560", 411 | "APW19981104.0507", 412 | "APW19981112.0305" 413 | ], 414 | "models": [ 415 | "D31011.M.100.T.G", 416 | "D31011.M.100.T.H", 417 | "D31011.M.100.T.I", 418 | "D31011.M.100.T.J" 419 | ] 420 | }, 421 | "D31013": { 422 | "inputs": [ 423 | "NYT19981024.0136", 424 | "NYT19981024.0193", 425 | 
"NYT19981025.0178", 426 | "NYT19981025.0186", 427 | "NYT19981025.0188", 428 | "NYT19981025.0236", 429 | "NYT19981025.0239", 430 | "NYT19981025.0249", 431 | "NYT19981026.0446", 432 | "NYT19981027.0421" 433 | ], 434 | "models": [ 435 | "D31013.M.100.T.B", 436 | "D31013.M.100.T.C", 437 | "D31013.M.100.T.D", 438 | "D31013.M.100.T.E" 439 | ] 440 | }, 441 | "D31022": { 442 | "inputs": [ 443 | "NYT19981030.0329", 444 | "APW19981030.1037", 445 | "APW19981030.1041", 446 | "APW19981030.1046", 447 | "APW19981030.1066", 448 | "APW19981031.0314", 449 | "APW19981031.0551", 450 | "APW19981101.0536", 451 | "APW19981101.0556", 452 | "APW19981102.0190" 453 | ], 454 | "models": [ 455 | "D31022.M.100.T.F", 456 | "D31022.M.100.T.G", 457 | "D31022.M.100.T.H", 458 | "D31022.M.100.T.I" 459 | ] 460 | }, 461 | "D31027": { 462 | "inputs": [ 463 | "APW19981018.0638", 464 | "APW19981022.0848", 465 | "APW19981023.0519", 466 | "APW19981023.1147", 467 | "APW19981024.0182", 468 | "APW19981024.0186", 469 | "APW19981025.0209", 470 | "APW19981025.0210", 471 | "APW19981025.0218", 472 | "APW19981025.0234" 473 | ], 474 | "models": [ 475 | "D31027.M.100.T.A", 476 | "D31027.M.100.T.B", 477 | "D31027.M.100.T.C", 478 | "D31027.M.100.T.D" 479 | ] 480 | }, 481 | "D31028": { 482 | "inputs": [ 483 | "NYT19981001.0271", 484 | "APW19981003.0141", 485 | "APW19981003.0142", 486 | "NYT19981004.0072", 487 | "APW19981004.0574", 488 | "APW19981005.1108", 489 | "NYT19981020.0178", 490 | "NYT19981026.0341", 491 | "APW19981031.0317", 492 | "NYT19981107.0072" 493 | ], 494 | "models": [ 495 | "D31028.M.100.T.A", 496 | "D31028.M.100.T.B", 497 | "D31028.M.100.T.C", 498 | "D31028.M.100.T.J" 499 | ] 500 | }, 501 | "D31031": { 502 | "inputs": [ 503 | "NYT19981005.0441", 504 | "NYT19981006.0047", 505 | "NYT19981006.0127", 506 | "NYT19981006.0391", 507 | "NYT19981006.0397", 508 | "NYT19981007.0399", 509 | "NYT19981008.0412", 510 | "NYT19981009.0452", 511 | "NYT19981009.0476", 512 | "NYT19981011.0181" 513 | ], 514 | "models": [ 515 | "D31031.M.100.T.E", 516 | "D31031.M.100.T.F", 517 | "D31031.M.100.T.G", 518 | "D31031.M.100.T.H" 519 | ] 520 | }, 521 | "D31033": { 522 | "inputs": [ 523 | "NYT19981018.0102", 524 | "NYT19981019.0284", 525 | "NYT19981019.0476", 526 | "NYT19981020.0315", 527 | "NYT19981020.0345", 528 | "NYT19981021.0064", 529 | "NYT19981021.0066", 530 | "NYT19981021.0400", 531 | "NYT19981022.0507", 532 | "NYT19981023.0251" 533 | ], 534 | "models": [ 535 | "D31033.M.100.T.D", 536 | "D31033.M.100.T.E", 537 | "D31033.M.100.T.F", 538 | "D31033.M.100.T.G" 539 | ] 540 | }, 541 | "D31038": { 542 | "inputs": [ 543 | "APW19981002.0556", 544 | "NYT19981003.0093", 545 | "APW19981003.0517", 546 | "NYT19981005.0386", 547 | "NYT19981005.0454", 548 | "NYT19981007.0383", 549 | "NYT19981016.0283", 550 | "NYT19981017.0140", 551 | "NYT19981021.0378", 552 | "NYT19981022.0453" 553 | ], 554 | "models": [ 555 | "D31038.M.100.T.A", 556 | "D31038.M.100.T.B", 557 | "D31038.M.100.T.I", 558 | "D31038.M.100.T.J" 559 | ] 560 | }, 561 | "D31041": { 562 | "inputs": [ 563 | "APW19981003.0184", 564 | "APW19981010.0163", 565 | "APW19981110.0245", 566 | "APW19981122.0379", 567 | "APW19981123.0257", 568 | "APW19981124.0261", 569 | "APW19981202.0265", 570 | "APW19981203.0309", 571 | "NYT19981218.0224" 572 | ], 573 | "models": [ 574 | "D31041.M.100.T.A", 575 | "D31041.M.100.T.H", 576 | "D31041.M.100.T.I", 577 | "D31041.M.100.T.J" 578 | ] 579 | }, 580 | "D31050": { 581 | "inputs": [ 582 | "NYT19981202.0309", 583 | "APW19981202.1274", 584 | "APW19981203.0338", 585 | 
"NYT19981207.0280", 586 | "NYT19981209.0542", 587 | "NYT19981216.0357", 588 | "APW19981216.0666", 589 | "NYT19981217.0274", 590 | "NYT19981218.0250", 591 | "APW19981220.0155" 592 | ], 593 | "models": [ 594 | "D31050.M.100.T.C", 595 | "D31050.M.100.T.D", 596 | "D31050.M.100.T.E", 597 | "D31050.M.100.T.F" 598 | ] 599 | } 600 | } 601 | -------------------------------------------------------------------------------- /sumpy/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tarfile 4 | import re 5 | from datetime import datetime 6 | import corenlp as cnlp 7 | import json 8 | import pkg_resources 9 | 10 | class DUCHelper(object): 11 | 12 | def __init__(self, duc_path=None, sumpy_data_path=None): 13 | if duc_path is None: 14 | duc_path = os.getenv("DUC_DATA", "~/DUC") 15 | self.duc_path = duc_path 16 | if sumpy_data_path is None: 17 | self.sumpy_data_path = os.getenv("SUMPY_DATA", 18 | os.path.join( 19 | os.path.expanduser("~"), ".sumpy")) 20 | 21 | def docset_iter(self, year, task): 22 | 23 | if year == 2003: 24 | if task == 2: 25 | duc_json_path = pkg_resources.resource_filename( 26 | "sumpy", os.path.join("data", "duc03_task2.json")) 27 | with open(duc_json_path, "r") as f: 28 | docsets = json.load(f, strict=False) 29 | 30 | docset_ids = sorted(docsets.keys()) 31 | for docset_id in docset_ids: 32 | ds = DUCDocset( 33 | docset_id, 2003, 2, 34 | docsets[docset_id]["inputs"], 35 | os.path.join( 36 | self.sumpy_data_path, "duc2003", "task2", 37 | docset_id, "inputs"), 38 | docsets[docset_id]["models"], 39 | os.path.join( 40 | self.sumpy_data_path, "duc2003", "task2", 41 | docset_id, "models")) 42 | 43 | yield ds 44 | 45 | 46 | elif year == 2004: 47 | if task == 2: 48 | duc_json_path = pkg_resources.resource_filename( 49 | "sumpy", os.path.join("data", "duc04_task2.json")) 50 | with open(duc_json_path, "r") as f: 51 | docsets = json.load(f, strict=False) 52 | 53 | docset_ids = sorted(docsets.keys()) 54 | for docset_id in docset_ids: 55 | ds = DUCDocset( 56 | docset_id, 2004, 2, 57 | docsets[docset_id]["inputs"], 58 | os.path.join( 59 | self.sumpy_data_path, "duc2004", "task2", 60 | docset_id, "inputs"), 61 | docsets[docset_id]["models"], 62 | os.path.join( 63 | self.sumpy_data_path, "duc2004", "task2", 64 | docset_id, "models")) 65 | 66 | yield ds 67 | 68 | # elif year == 2007: 69 | # if task == 2: 70 | # for docset_id in self.duc07_task2_docset_ids: 71 | # dsA = DUCDocset( 72 | # docset_id, 2007, 2, 73 | # self.duc07_task2[docset_id]["A"]["inputs"], 74 | # os.path.join(self.duc07_task2_docsets_path, 75 | # "{}-A".format(docset_id)), 76 | # self.duc07_task2[docset_id]["A"]["models"], 77 | # os.path.join(self.duc07_task2_models_path)) 78 | # dsB = DUCDocset( 79 | # docset_id, 2007, 2, 80 | # self.duc07_task2[docset_id]["B"]["inputs"], 81 | # os.path.join(self.duc07_task2_docsets_path, 82 | # "{}-B".format(docset_id)), 83 | # self.duc07_task2[docset_id]["B"]["models"], 84 | # os.path.join(self.duc07_task2_models_path)) 85 | # dsC = DUCDocset( 86 | # docset_id, 2007, 2, 87 | # self.duc07_task2[docset_id]["C"]["inputs"], 88 | # os.path.join(self.duc07_task2_docsets_path, 89 | # "{}-C".format(docset_id)), 90 | # self.duc07_task2[docset_id]["C"]["models"], 91 | # os.path.join(self.duc07_task2_models_path)) 92 | # 93 | # ds = DUCUpdateDocset( 94 | # docset_id, year, task, [dsA, dsB, dsC]) 95 | # yield ds 96 | 97 | else: 98 | raise Exception("Bad argument: year is {}".format(year)) 99 | 100 | def docsets(self, year, task): 
101 | if year == 2003: 102 | if task == 2: 103 | return DUCDocsets([ds for ds in self.docset_iter(2003, 2)]) 104 | else: 105 | raise Exception("Bad argument: task is {}".format(task)) 106 | elif year == 2004: 107 | if task == 2: 108 | return DUCDocsets([ds for ds in self.docset_iter(2004, 2)]) 109 | else: 110 | raise Exception("Bad argument: task is {}".format(task)) 111 | else: 112 | raise Exception("Bad argument: year is {}".format(year)) 113 | 114 | def install(self, year, task): 115 | if year == 2001: 116 | raise Exception("Not implemented!") 117 | elif year == 2002: 118 | raise Exception("Not Implemented!") 119 | elif year == 2003: 120 | self._install_duc03_task2() 121 | elif year == 2004: 122 | self._install_duc04_task2() 123 | else: 124 | raise Exception("Not implemented!") 125 | 126 | def _install_duc03_task2(self): 127 | data_path = os.path.join(self.sumpy_data_path, "duc2003", "task2") 128 | if not os.path.exists(data_path): 129 | os.makedirs(data_path) 130 | data_path_duc = os.path.join( 131 | self.duc_path, "DUC2003_Summarization_Documents.tgz") 132 | data_path_models = os.path.join( 133 | self.duc_path, "detagged.duc2003.abstracts.tar.gz") 134 | 135 | if not os.path.exists(data_path_duc): 136 | raise Exception("{} does not exist. " \ 137 | "Please obtain this file from NIST.".format( 138 | data_path_duc)) 139 | if not os.path.exists(data_path_models): 140 | raise Exception("{} does not exist. " \ 141 | "Please obtain this file from NIST.".format( 142 | data_path_models)) 143 | 144 | 145 | docsets = {} 146 | 147 | docs_tar = os.path.join("DUC2003_Summarization_Documents", 148 | "duc2003_testdata", "task2", "task2.docs.tar.gz") 149 | with tarfile.open(name=data_path_duc, mode="r") as tf: 150 | for m in tf.getmembers(): 151 | if m.name == docs_tar: 152 | break 153 | 154 | f = tf.extractfile(m) 155 | from StringIO import StringIO 156 | b = StringIO(f.read()) 157 | with tarfile.open(fileobj=b, mode="r") as dtf: 158 | for m in dtf.getmembers(): 159 | path, doc_id = os.path.split(m.name) 160 | _, docset_id = os.path.split(path) 161 | text = dtf.extractfile(m).read() 162 | docset_id = docset_id.upper()[:-1] 163 | docset = docsets.get( 164 | docset_id, {"inputs": [], "models": []}) 165 | docset["inputs"].append({"input id": doc_id, "text": text}) 166 | docsets[docset_id] = docset 167 | with tarfile.open(name=data_path_models, mode="r") as tf: 168 | for m in tf.getmembers(): 169 | path, model = os.path.split(m.name) 170 | if os.path.split(path)[1] == "models": 171 | if re.search(r'D\d{5}\.\w\.100\.\w\.\w.html', model): 172 | docset_id = model.split(".")[0] 173 | model_id = os.path.splitext(model)[0] 174 | text = tf.extractfile(m).read() 175 | docsets[docset_id]["models"].append( 176 | {"model id": model_id, 177 | "text": text}) 178 | 179 | #annotators=["tokenize", "ssplit"] 180 | #with cnlp.Server(annotators=annotators) as pipeline: 181 | for docset_id, docset in docsets.items(): 182 | inputs_path = os.path.join(data_path, docset_id, "inputs") 183 | if not os.path.exists(inputs_path): 184 | os.makedirs(inputs_path) 185 | for input in docset["inputs"]: 186 | input_path = os.path.join(inputs_path, input["input id"]) 187 | with open(input_path, "wb") as f: 188 | f.write(input["text"]) 189 | 190 | models_path = os.path.join(data_path, docset_id, "models") 191 | if not os.path.exists(models_path): 192 | os.makedirs(models_path) 193 | for model in docset["models"]: 194 | model_path = os.path.join(models_path, model["model id"]) 195 | #doc = pipeline.annotate(model["text"]) 196 | 197 | with 
open(model_path, "wb") as f: 198 | f.write(model["text"]) 199 | #for sent in doc: 200 | # line = " ".join([str(tok) for tok in sent]) + "\n" 201 | # f.write(line) 202 | 203 | 204 | def _install_duc04_task2(self): 205 | data_path = os.path.join(self.sumpy_data_path, "duc2004", "task2") 206 | if not os.path.exists(data_path): 207 | os.makedirs(data_path) 208 | data_path_duc = os.path.join( 209 | self.duc_path, "DUC2004_Summarization_Documents.tgz") 210 | data_path_models = os.path.join( 211 | self.duc_path, "duc2004_results.tgz") 212 | 213 | if not os.path.exists(data_path_duc): 214 | raise Exception("{} does not exist. " \ 215 | "Please obtain this file from NIST.".format( 216 | data_path_duc)) 217 | if not os.path.exists(data_path_models): 218 | raise Exception("{} does not exist. " \ 219 | "Please obtain this file from NIST.".format( 220 | data_path_models)) 221 | 222 | docsets = {} 223 | tgt_path = os.path.join("DUC2004_Summarization_Documents", 224 | "duc2004_testdata", "tasks1and2", "duc2004_tasks1and2_docs", 225 | "docs") 226 | with tarfile.open(name=data_path_duc, mode="r") as tf: 227 | for m in tf.getmembers(): 228 | path, doc_id = os.path.split(m.name) 229 | path, docset_id = os.path.split(path) 230 | if path == tgt_path: 231 | docset_id = docset_id.upper()[:-1] 232 | text = tf.extractfile(m).read() 233 | docset = docsets.get( 234 | docset_id, {"inputs": [], "models": []}) 235 | docset["inputs"].append({"input id": doc_id, "text": text}) 236 | docsets[docset_id] = docset 237 | tgt_path = os.path.join("duc2004_results", "ROUGE", 238 | "duc2004.task2.ROUGE.models.tar.gz") 239 | with tarfile.open(name=data_path_models, mode="r") as tf: 240 | for m in tf.getmembers(): 241 | if m.name == tgt_path: 242 | break 243 | models_tar = tf.extractfile(m) 244 | with tarfile.open(fileobj=models_tar, mode="r") as mtf: 245 | for m in mtf.getmembers(): 246 | model_id = os.path.split(m.name)[1] 247 | docset_id = model_id.split(".")[0] 248 | text = mtf.extractfile(m).read() 249 | docsets[docset_id]["models"].append( 250 | {"model id": model_id, 251 | "text": text}) 252 | 253 | #annotators=["tokenize", "ssplit"] 254 | #with cnlp.Server(annotators=annotators) as pipeline: 255 | for docset_id, docset in docsets.items(): 256 | inputs_path = os.path.join(data_path, docset_id, "inputs") 257 | if not os.path.exists(inputs_path): 258 | os.makedirs(inputs_path) 259 | for input in docset["inputs"]: 260 | input_path = os.path.join(inputs_path, input["input id"]) 261 | with open(input_path, "wb") as f: 262 | f.write(input["text"]) 263 | 264 | models_path = os.path.join(data_path, docset_id, "models") 265 | if not os.path.exists(models_path): 266 | os.makedirs(models_path) 267 | for model in docset["models"]: 268 | model_path = os.path.join(models_path, model["model id"]) 269 | 270 | #doc = pipeline.annotate(model["text"]) 271 | 272 | with open(model_path, "wb") as f: 273 | f.write(model["text"]) 274 | #for sent in doc: 275 | # line = " ".join([str(tok) for tok in sent]) + "\n" 276 | # f.write(line) 277 | 278 | 279 | 280 | 281 | # def _install_duc01_task2(self): 282 | # 283 | # data_path = os.path.join(self.sumpy_data_path, "duc2001", "task2") 284 | # if not os.path.exists(data_path): 285 | # os.makedirs(data_path) 286 | # data_path_duc = os.path.join( 287 | # self.duc_path, "DUC2001_Summarization_Documents.tgz") 288 | # 289 | # if not os.path.exists(data_path_duc): 290 | # raise Exception("{} does not exist. 
" \ 291 | # "Please obtain this file from NIST.".format( 292 | # data_path_duc)) 293 | # 294 | # docments_tar_path = os.path.join("DUC2001_Summarization_Documents", 295 | # "data", "testtraining", "Duc2001testtraining.tar.gz") 296 | # 297 | # with tarfile.open(name=data_path_duc, mode="r") as tf: 298 | # mem_documents_tar = [m for m in tf.getmembers() 299 | # if m.name == docments_tar_path] 300 | # tf.extractall(members=mem_documents_tar) 301 | # documents_tar_path = os.path.join( 302 | # "DUC2001_Summarization_Documents", "data", "testtraining", 303 | # "Duc2001testtraining.tar.gz") 304 | # 305 | # if not os.path.exists(documents_tar_path): 306 | # raise Exception("Failed to extract DUC 2001 documents!") 307 | # 308 | # with tarfile.open(docments_tar_path, mode="r") as tf: 309 | # tf.extractall() 310 | # 311 | # documents_path = "duc2002testtraining" 312 | # if not os.path.exists(documents_path): 313 | # raise Exception("Failed to extract DUC 2001 documents!") 314 | # 315 | # docsets = {} 316 | # for docset_id in os.listdir(documents_path): 317 | # docset_path = os.path.join(documents_path, docset_id) 318 | # articles = [] 319 | # for article_name in os.listdir(docset_path): 320 | # if article_name.startswith("ap"): 321 | # year = 1900 + int(article_name[2:4]) 322 | # month = int(article_name[4:6]) 323 | # day = int(article_name[6:8]) 324 | # ts = datetime(year, month, day) 325 | # elif article_name.startswith("wsj"): 326 | # year = 1900 + int(article_name[3:5]) 327 | # month = int(article_name[5:7]) 328 | # day = int(article_name[7:9]) 329 | # ts = datetime(year, month, day) 330 | # elif article_name.startswith("la"): 331 | # year = 1900 + int(article_name[6:8]) 332 | # month = int(article_name[2:4]) 333 | # day = int(article_name[4:6]) 334 | # ts = datetime(year, month, day) 335 | # elif article_name.startswith("ft"): 336 | # year = 1900 + int(article_name[2:4]) 337 | # month = int(article_name.split("-")[0][4:]) 338 | # ts = datetime(year, month, 1) 339 | # elif article_name.startswith("fbis"): 340 | # ts = datetime(1977,1,1) 341 | # elif article_name.startswith("sjmn"): 342 | # ts = datetime(91,1,1) 343 | # else: 344 | # raise Exception("Found unsual file here {}".format( 345 | # article_name)) 346 | # print article_name, ts 347 | # article_path = os.path.join( 348 | # docset_path, article_name, "{}.body".format(article_name)) 349 | # with open(article_path, "rb") as f: 350 | # content = f.read() 351 | # articles.append({"input id": article_name, 352 | # "raw text": content, 353 | # "timestamp": ts}) 354 | # docsets[docset_id] = articles 355 | # 356 | # shutil.rmtree("DUC2001_Summarization_Documents") 357 | # shutil.rmtree(documents_path) 358 | 359 | class DUCDocsets(object): 360 | def __init__(self, docsets): 361 | self._docsets = {ds.docset_id: ds for ds in docsets} 362 | 363 | def __getitem__(self, ds_id): 364 | return self._docsets[ds_id] 365 | 366 | class DUCDocset(object): 367 | def __init__(self, docset_id, year, task, inputs, input_root, 368 | models, model_root): 369 | self.docset_id = docset_id 370 | self.year = year 371 | self.task = task 372 | self.inputs = inputs 373 | self.input_root = input_root 374 | self.models = models 375 | self.model_root = model_root 376 | 377 | def __str__(self): 378 | return "DUCDocset({}, {}, {}, {} inputs, {}, {} models, {})".format( 379 | self.docset_id, self.year, self.task, len(self.inputs), 380 | self.input_root[:10] + "...", len(self.models), 381 | self.model_root[:10] + "...") 382 | 383 | def input_iter(self): 384 | for doc_id in 
self.inputs: 385 | timestamp_t = int(doc_id[3:7]), int(doc_id[7:9]), int(doc_id[9:11]) 386 | timestamp = datetime(*timestamp_t) 387 | 388 | yield DUCDocument( 389 | doc_id, timestamp, os.path.join(self.input_root, doc_id)) 390 | 391 | def model_iter(self): 392 | for doc_id in self.models: 393 | yield DUCModel(doc_id, os.path.join(self.model_root, doc_id)) 394 | 395 | class DUCUpdateDocset(object): 396 | def __init__(self, docset_id, year, task, docsets): 397 | self.docset_id = docset_id 398 | self.year = year 399 | self.task = task 400 | self.docsets = docsets 401 | 402 | def update_iter(self): 403 | for update_ds in self.docsets: 404 | yield update_ds 405 | 406 | class DUCDocument(object): 407 | def __init__(self, doc_id, timestamp, path): 408 | self.doc_id = doc_id 409 | self.timestamp = timestamp 410 | self.path = path 411 | self._text = None 412 | 413 | def _read(self): 414 | if os.path.exists(self.path): 415 | with open(self.path, "rb") as f: 416 | self._text = f.read() 417 | else: 418 | raise Exception("DUCDocument {} not found at path {}".format( 419 | self.doc_id, self.path)) 420 | 421 | def __str__(self): 422 | if self._text is None: 423 | self._read() 424 | return self._text 425 | 426 | def __unicode__(self): 427 | if self._text is None: 428 | self._read() 429 | return self._text.decode("utf-8") 430 | 431 | def __bytes__(self): 432 | if self._text is None: 433 | self._read() 434 | return self._text 435 | 436 | class DUCModel(object): 437 | def __init__(self, doc_id, path): 438 | self.doc_id = doc_id 439 | self.path = path 440 | self._text = None 441 | 442 | def _read(self): 443 | if os.path.exists(self.path): 444 | with open(self.path, "rb") as f: 445 | self._text = f.read() 446 | else: 447 | raise Exception("DUCModel {} not found at path {}".format( 448 | self.doc_id, self.path)) 449 | 450 | def __str__(self): 451 | if self._text is None: 452 | self._read() 453 | return self._text 454 | 455 | def __unicode__(self): 456 | if self._text is None: 457 | self._read() 458 | return self._text.decode("utf-8") 459 | 460 | def __bytes__(self): 461 | if self._text is None: 462 | self._read() 463 | return self._text 464 | -------------------------------------------------------------------------------- /sumpy/data/duc04_task2.json: -------------------------------------------------------------------------------- 1 | { 2 | "D30001": { 3 | "inputs": [ 4 | "APW19981016.0240", 5 | "APW19981022.0269", 6 | "APW19981026.0220", 7 | "APW19981027.0491", 8 | "APW19981031.0167", 9 | "APW19981113.0251", 10 | "APW19981116.0205", 11 | "APW19981118.0276", 12 | "APW19981120.0274", 13 | "APW19981124.0267" 14 | ], 15 | "models": [ 16 | "D30001.M.100.T.A", 17 | "D30001.M.100.T.B", 18 | "D30001.M.100.T.C", 19 | "D30001.M.100.T.D" 20 | ] 21 | }, 22 | "D30002": { 23 | "inputs": [ 24 | "APW19981027.0241", 25 | "APW19981028.1120", 26 | "APW19981029.0570", 27 | "APW19981031.0720", 28 | "APW19981101.0843", 29 | "APW19981102.0737", 30 | "APW19981103.0526", 31 | "APW19981104.0539", 32 | "APW19981105.1220", 33 | "APW19981106.0869" 34 | ], 35 | "models": [ 36 | "D30002.M.100.T.A", 37 | "D30002.M.100.T.B", 38 | "D30002.M.100.T.C", 39 | "D30002.M.100.T.E" 40 | ] 41 | }, 42 | "D30003": { 43 | "inputs": [ 44 | "APW19981018.0423", 45 | "APW19981019.0098", 46 | "APW19981020.0241", 47 | "APW19981021.0557", 48 | "APW19981022.1132", 49 | "APW19981023.1166", 50 | "APW19981024.0192", 51 | "APW19981025.0449", 52 | "NYT19981026.0292", 53 | "APW19981028.0444" 54 | ], 55 | "models": [ 56 | "D30003.M.100.T.A", 57 | "D30003.M.100.T.B", 58 
| "D30003.M.100.T.C", 59 | "D30003.M.100.T.F" 60 | ] 61 | }, 62 | "D30005": { 63 | "inputs": [ 64 | "NYT19981003.0093", 65 | "APW19981003.0517", 66 | "APW19981111.0288", 67 | "APW19981112.0551", 68 | "NYT19981113.0410", 69 | "APW19981119.0552", 70 | "APW19981120.0887", 71 | "APW19981129.0665", 72 | "NYT19981201.0444", 73 | "NYT19981202.0428" 74 | ], 75 | "models": [ 76 | "D30005.M.100.T.A", 77 | "D30005.M.100.T.B", 78 | "D30005.M.100.T.C", 79 | "D30005.M.100.T.G" 80 | ] 81 | }, 82 | "D30006": { 83 | "inputs": [ 84 | "NYT19981003.0083", 85 | "NYT19981005.0385", 86 | "NYT19981006.0396", 87 | "NYT19981008.0461", 88 | "NYT19981013.0354", 89 | "NYT19981014.0003", 90 | "NYT19981018.0175", 91 | "APW19981018.0836", 92 | "APW19981018.0888", 93 | "NYT19981021.0014" 94 | ], 95 | "models": [ 96 | "D30006.M.100.T.A", 97 | "D30006.M.100.T.B", 98 | "D30006.M.100.T.C", 99 | "D30006.M.100.T.H" 100 | ] 101 | }, 102 | "D30007": { 103 | "inputs": [ 104 | "APW19981001.0312", 105 | "APW19981002.0522", 106 | "APW19981002.0567", 107 | "APW19981004.0851", 108 | "APW19981006.0556", 109 | "APW19981007.0574", 110 | "APW19981010.0696", 111 | "APW19981011.0515", 112 | "APW19981011.0744", 113 | "APW19981013.0275" 114 | ], 115 | "models": [ 116 | "D30007.M.100.T.A", 117 | "D30007.M.100.T.B", 118 | "D30007.M.100.T.D", 119 | "D30007.M.100.T.E" 120 | ] 121 | }, 122 | "D30008": { 123 | "inputs": [ 124 | "APW19981004.0281", 125 | "APW19981006.0251", 126 | "APW19981013.0853", 127 | "APW19981016.0437", 128 | "APW19981019.0104", 129 | "APW19981023.0281", 130 | "NYT19981105.0538", 131 | "APW19981108.0837", 132 | "APW19981109.0264", 133 | "APW19981109.0274" 134 | ], 135 | "models": [ 136 | "D30008.M.100.T.A", 137 | "D30008.M.100.T.B", 138 | "D30008.M.100.T.D", 139 | "D30008.M.100.T.G" 140 | ] 141 | }, 142 | "D30010": { 143 | "inputs": [ 144 | "APW19981106.0273", 145 | "APW19981106.0274", 146 | "APW19981106.0275", 147 | "APW19981106.0276", 148 | "NYT19981107.0251", 149 | "APW19981107.0568", 150 | "APW19981107.0744", 151 | "APW19981107.0752", 152 | "NYT19981108.0136", 153 | "APW19981108.0188" 154 | ], 155 | "models": [ 156 | "D30010.M.100.T.A", 157 | "D30010.M.100.T.B", 158 | "D30010.M.100.T.D", 159 | "D30010.M.100.T.H" 160 | ] 161 | }, 162 | "D30011": { 163 | "inputs": [ 164 | "APW19981001.0315", 165 | "APW19981002.0550", 166 | "APW19981002.1081", 167 | "NYT19981003.0120", 168 | "APW19981003.0144", 169 | "APW19981004.0281", 170 | "APW19981005.0484", 171 | "APW19981006.0251", 172 | "APW19981008.0259", 173 | "APW19981008.0527" 174 | ], 175 | "models": [ 176 | "D30011.M.100.T.A", 177 | "D30011.M.100.T.B", 178 | "D30011.M.100.T.E", 179 | "D30011.M.100.T.F" 180 | ] 181 | }, 182 | "D30015": { 183 | "inputs": [ 184 | "NYT19981004.0102", 185 | "APW19981005.0205", 186 | "APW19981005.0223", 187 | "APW19981005.0233", 188 | "NYT19981005.0391", 189 | "APW19981005.0496", 190 | "APW19981005.0506", 191 | "APW19981005.0762", 192 | "APW19981005.1072", 193 | "APW19981005.1082" 194 | ], 195 | "models": [ 196 | "D30015.M.100.T.A", 197 | "D30015.M.100.T.B", 198 | "D30015.M.100.T.E", 199 | "D30015.M.100.T.H" 200 | ] 201 | }, 202 | "D30017": { 203 | "inputs": [ 204 | "APW19981010.0187", 205 | "APW19981022.0488", 206 | "APW19981104.0245", 207 | "APW19981110.0240", 208 | "NYT19981114.0099", 209 | "APW19981118.0898", 210 | "APW19981119.0262", 211 | "APW19981124.0251", 212 | "NYT19981209.0451", 213 | "APW19981221.0189" 214 | ], 215 | "models": [ 216 | "D30017.M.100.T.A", 217 | "D30017.M.100.T.B", 218 | "D30017.M.100.T.F", 219 | "D30017.M.100.T.G" 220 | ] 221 
| }, 222 | "D30020": { 223 | "inputs": [ 224 | "APW19981002.0557", 225 | "APW19981029.0521", 226 | "APW19981030.1074", 227 | "APW19981102.0220", 228 | "APW19981126.0432", 229 | "APW19981126.0450", 230 | "APW19981128.0168", 231 | "APW19981206.0174", 232 | "APW19981206.0199", 233 | "APW19981206.0379" 234 | ], 235 | "models": [ 236 | "D30020.M.100.T.A", 237 | "D30020.M.100.T.C", 238 | "D30020.M.100.T.D", 239 | "D30020.M.100.T.E" 240 | ] 241 | }, 242 | "D30022": { 243 | "inputs": [ 244 | "APW19981005.0231", 245 | "APW19981015.0177", 246 | "NYT19981202.0309", 247 | "APW19981202.1274", 248 | "APW19981203.0338", 249 | "NYT19981207.0280", 250 | "NYT19981209.0542", 251 | "NYT19981216.0357", 252 | "APW19981216.0666", 253 | "NYT19981217.0274" 254 | ], 255 | "models": [ 256 | "D30022.M.100.T.A", 257 | "D30022.M.100.T.C", 258 | "D30022.M.100.T.D", 259 | "D30022.M.100.T.F" 260 | ] 261 | }, 262 | "D30024": { 263 | "inputs": [ 264 | "NYT19981007.0464", 265 | "NYT19981104.0369", 266 | "NYT19981104.0516", 267 | "NYT19981104.0545", 268 | "NYT19981104.0597", 269 | "NYT19981104.0600", 270 | "NYT19981105.0439", 271 | "NYT19981105.0509", 272 | "NYT19981105.0525", 273 | "NYT19981106.0464" 274 | ], 275 | "models": [ 276 | "D30024.M.100.T.A", 277 | "D30024.M.100.T.C", 278 | "D30024.M.100.T.D", 279 | "D30024.M.100.T.G" 280 | ] 281 | }, 282 | "D30026": { 283 | "inputs": [ 284 | "NYT19981101.0082", 285 | "NYT19981122.0131", 286 | "NYT19981122.0163", 287 | "NYT19981123.0453", 288 | "NYT19981123.0458", 289 | "NYT19981123.0478", 290 | "NYT19981124.0340", 291 | "NYT19981124.0365", 292 | "NYT19981124.0411", 293 | "NYT19981125.0073" 294 | ], 295 | "models": [ 296 | "D30026.M.100.T.A", 297 | "D30026.M.100.T.C", 298 | "D30026.M.100.T.D", 299 | "D30026.M.100.T.H" 300 | ] 301 | }, 302 | "D30027": { 303 | "inputs": [ 304 | "NYT19981001.0363", 305 | "NYT19981001.0379", 306 | "APW19981001.1177", 307 | "NYT19981002.0250", 308 | "NYT19981002.0300", 309 | "APW19981002.0778", 310 | "APW19981002.0783", 311 | "APW19981002.0809", 312 | "APW19981003.0292", 313 | "NYT19981004.0132" 314 | ], 315 | "models": [ 316 | "D30027.M.100.T.A", 317 | "D30027.M.100.T.C", 318 | "D30027.M.100.T.E", 319 | "D30027.M.100.T.G" 320 | ] 321 | }, 322 | "D30028": { 323 | "inputs": [ 324 | "APW19981001.0539", 325 | "APW19981004.0182", 326 | "APW19981004.0296", 327 | "APW19981004.0321", 328 | "APW19981004.0550", 329 | "APW19981005.0236", 330 | "APW19981005.0457", 331 | "APW19981005.0467", 332 | "APW19981005.0474", 333 | "APW19981005.1033" 334 | ], 335 | "models": [ 336 | "D30028.M.100.T.A", 337 | "D30028.M.100.T.C", 338 | "D30028.M.100.T.F", 339 | "D30028.M.100.T.G" 340 | ] 341 | }, 342 | "D30029": { 343 | "inputs": [ 344 | "APW19981224.0814", 345 | "APW19981226.0185", 346 | "APW19981227.0319", 347 | "APW19981227.0766", 348 | "APW19981227.0803", 349 | "APW19981227.0836", 350 | "APW19981227.0840", 351 | "APW19981227.0853", 352 | "APW19981227.0870", 353 | "APW19981228.0467" 354 | ], 355 | "models": [ 356 | "D30029.M.100.T.A", 357 | "D30029.M.100.T.C", 358 | "D30029.M.100.T.F", 359 | "D30029.M.100.T.H" 360 | ] 361 | }, 362 | "D30031": { 363 | "inputs": [ 364 | "NYT19981203.0460", 365 | "NYT19981204.0365", 366 | "NYT19981206.0178", 367 | "APW19981207.0418", 368 | "APW19981207.0577", 369 | "APW19981207.0578", 370 | "APW19981207.0580", 371 | "APW19981207.0581", 372 | "APW19981207.0583", 373 | "NYT19981113.0404" 374 | ], 375 | "models": [ 376 | "D30031.M.100.T.A", 377 | "D30031.M.100.T.D", 378 | "D30031.M.100.T.E", 379 | "D30031.M.100.T.F" 380 | ] 381 | }, 382 | 
"D30033": { 383 | "inputs": [ 384 | "NYT19981119.0380", 385 | "APW19981127.0244", 386 | "APW19981203.0649", 387 | "APW19981203.1240", 388 | "NYT19981223.0347", 389 | "APW19981228.0189", 390 | "APW19981229.0467", 391 | "APW19981230.0431", 392 | "APW19981230.0473", 393 | "APW19981231.0143" 394 | ], 395 | "models": [ 396 | "D30033.M.100.T.A", 397 | "D30033.M.100.T.D", 398 | "D30033.M.100.T.E", 399 | "D30033.M.100.T.G" 400 | ] 401 | }, 402 | "D30034": { 403 | "inputs": [ 404 | "APW19981124.0554", 405 | "APW19981126.0443", 406 | "APW19981130.0497", 407 | "APW19981205.0792", 408 | "APW19981205.0807", 409 | "APW19981211.0972", 410 | "APW19981211.0982", 411 | "APW19981211.0990", 412 | "APW19981212.0541", 413 | "APW19981221.0448" 414 | ], 415 | "models": [ 416 | "D30034.M.100.T.A", 417 | "D30034.M.100.T.D", 418 | "D30034.M.100.T.F", 419 | "D30034.M.100.T.G" 420 | ] 421 | }, 422 | "D30036": { 423 | "inputs": [ 424 | "APW19981006.0833", 425 | "APW19981007.0823", 426 | "NYT19981008.0338", 427 | "APW19981008.0523", 428 | "APW19981008.1113", 429 | "APW19981009.0788", 430 | "APW19981011.0541", 431 | "APW19981012.0252", 432 | "APW19981012.0267", 433 | "NYT19981012.0334" 434 | ], 435 | "models": [ 436 | "D30036.M.100.T.A", 437 | "D30036.M.100.T.D", 438 | "D30036.M.100.T.F", 439 | "D30036.M.100.T.H" 440 | ] 441 | }, 442 | "D30037": { 443 | "inputs": [ 444 | "APW19981004.0138", 445 | "NYT19981009.0337", 446 | "NYT19981009.0369", 447 | "NYT19981009.0486", 448 | "APW19981009.1040", 449 | "APW19981010.0164", 450 | "APW19981010.0173", 451 | "APW19981010.0374", 452 | "APW19981010.0383", 453 | "APW19981012.0254" 454 | ], 455 | "models": [ 456 | "D30037.M.100.T.A", 457 | "D30037.M.100.T.D", 458 | "D30037.M.100.T.G", 459 | "D30037.M.100.T.H" 460 | ] 461 | }, 462 | "D30038": { 463 | "inputs": [ 464 | "APW19981211.0352", 465 | "APW19981211.1276", 466 | "APW19981211.1288", 467 | "APW19981212.0354", 468 | "APW19981212.0562", 469 | "NYT19981213.0205", 470 | "APW19981213.0396", 471 | "APW19981213.0412", 472 | "APW19981213.0424", 473 | "APW19981213.0720" 474 | ], 475 | "models": [ 476 | "D30038.M.100.T.A", 477 | "D30038.M.100.T.E", 478 | "D30038.M.100.T.F", 479 | "D30038.M.100.T.H" 480 | ] 481 | }, 482 | "D30040": { 483 | "inputs": [ 484 | "APW19981119.1180", 485 | "APW19981119.1227", 486 | "APW19981123.1179", 487 | "APW19981124.0254", 488 | "APW19981124.0256", 489 | "APW19981205.0220", 490 | "APW19981229.0756", 491 | "APW19981229.0763", 492 | "APW19981230.0983", 493 | "APW19981230.0991" 494 | ], 495 | "models": [ 496 | "D30040.M.100.T.A", 497 | "D30040.M.100.T.E", 498 | "D30040.M.100.T.G", 499 | "D30040.M.100.T.H" 500 | ] 501 | }, 502 | "D30042": { 503 | "inputs": [ 504 | "APW19981019.0307", 505 | "APW19981129.0668", 506 | "APW19981202.1230", 507 | "APW19981205.0172", 508 | "APW19981205.0213", 509 | "APW19981206.0364", 510 | "APW19981206.0371", 511 | "APW19981209.1444", 512 | "APW19981212.0848", 513 | "APW19981221.1004" 514 | ], 515 | "models": [ 516 | "D30042.M.100.T.B", 517 | "D30042.M.100.T.C", 518 | "D30042.M.100.T.D", 519 | "D30042.M.100.T.F" 520 | ] 521 | }, 522 | "D30044": { 523 | "inputs": [ 524 | "APW19981114.0575", 525 | "APW19981115.0371", 526 | "APW19981115.0618", 527 | "APW19981115.0626", 528 | "APW19981116.0221", 529 | "APW19981116.0235", 530 | "NYT19981116.0479", 531 | "APW19981116.1120", 532 | "APW19981117.0528", 533 | "APW19981117.0530" 534 | ], 535 | "models": [ 536 | "D30044.M.100.T.B", 537 | "D30044.M.100.T.C", 538 | "D30044.M.100.T.D", 539 | "D30044.M.100.T.G" 540 | ] 541 | }, 542 | "D30045": { 543 | 
"inputs": [ 544 | "NYT19981125.0417", 545 | "NYT19981125.0433", 546 | "NYT19981126.0192", 547 | "NYT19981127.0203", 548 | "NYT19981127.0240", 549 | "NYT19981127.0256", 550 | "NYT19981127.0264", 551 | "NYT19981127.0289", 552 | "NYT19981127.0293", 553 | "NYT19981129.0113" 554 | ], 555 | "models": [ 556 | "D30045.M.100.T.B", 557 | "D30045.M.100.T.C", 558 | "D30045.M.100.T.E", 559 | "D30045.M.100.T.F" 560 | ] 561 | }, 562 | "D30046": { 563 | "inputs": [ 564 | "NYT19981217.0394", 565 | "NYT19981218.0380", 566 | "NYT19981219.0101", 567 | "NYT19981219.0102", 568 | "NYT19981219.0104", 569 | "NYT19981219.0106", 570 | "NYT19981219.0117", 571 | "NYT19981219.0145", 572 | "NYT19981219.0148", 573 | "NYT19981219.0170" 574 | ], 575 | "models": [ 576 | "D30046.M.100.T.B", 577 | "D30046.M.100.T.C", 578 | "D30046.M.100.T.E", 579 | "D30046.M.100.T.H" 580 | ] 581 | }, 582 | "D30047": { 583 | "inputs": [ 584 | "NYT19981113.0404", 585 | "APW19981119.0252", 586 | "APW19981120.0290", 587 | "NYT19981120.0427", 588 | "APW19981120.0892", 589 | "APW19981121.0727", 590 | "APW19981207.0577", 591 | "APW19981207.0578", 592 | "APW19981207.0580", 593 | "APW19981209.1470" 594 | ], 595 | "models": [ 596 | "D30047.M.100.T.B", 597 | "D30047.M.100.T.C", 598 | "D30047.M.100.T.F", 599 | "D30047.M.100.T.H" 600 | ] 601 | }, 602 | "D30048": { 603 | "inputs": [ 604 | "APW19981016.0655", 605 | "APW19981017.0692", 606 | "APW19981019.0550", 607 | "NYT19981020.0380", 608 | "NYT19981020.0382", 609 | "APW19981020.1106", 610 | "APW19981021.1160", 611 | "APW19981021.1170", 612 | "APW19981022.1123", 613 | "NYT19981024.0050" 614 | ], 615 | "models": [ 616 | "D30048.M.100.T.B", 617 | "D30048.M.100.T.C", 618 | "D30048.M.100.T.G", 619 | "D30048.M.100.T.H" 620 | ] 621 | }, 622 | "D30049": { 623 | "inputs": [ 624 | "APW19981023.0254", 625 | "NYT19981028.0441", 626 | "NYT19981114.0099", 627 | "NYT19981118.0464", 628 | "NYT19981121.0041", 629 | "NYT19981121.0117", 630 | "APW19981121.0131", 631 | "APW19981121.0344", 632 | "NYT19981122.0111", 633 | "APW19981203.0321" 634 | ], 635 | "models": [ 636 | "D30049.M.100.T.B", 637 | "D30049.M.100.T.D", 638 | "D30049.M.100.T.E", 639 | "D30049.M.100.T.G" 640 | ] 641 | }, 642 | "D30050": { 643 | "inputs": [ 644 | "NYT19981002.0309", 645 | "NYT19981002.0366", 646 | "NYT19981005.0479", 647 | "NYT19981006.0394", 648 | "NYT19981008.0387", 649 | "NYT19981008.0453", 650 | "NYT19981009.0434", 651 | "NYT19981010.0027", 652 | "NYT19981010.0149", 653 | "NYT19981010.0151" 654 | ], 655 | "models": [ 656 | "D30050.M.100.T.B", 657 | "D30050.M.100.T.D", 658 | "D30050.M.100.T.E", 659 | "D30050.M.100.T.H" 660 | ] 661 | }, 662 | "D30051": { 663 | "inputs": [ 664 | "APW19981108.0803", 665 | "APW19981111.0631", 666 | "NYT19981114.0079", 667 | "APW19981116.0213", 668 | "APW19981116.0231", 669 | "APW19981116.0525", 670 | "APW19981121.0514", 671 | "APW19981129.0969", 672 | "APW19981130.0803", 673 | "NYT19981202.0391" 674 | ], 675 | "models": [ 676 | "D30051.M.100.T.B", 677 | "D30051.M.100.T.D", 678 | "D30051.M.100.T.F", 679 | "D30051.M.100.T.H" 680 | ] 681 | }, 682 | "D30053": { 683 | "inputs": [ 684 | "NYT19981127.0267", 685 | "APW19981206.0557", 686 | "APW19981206.0559", 687 | "APW19981207.1390", 688 | "APW19981208.0312", 689 | "APW19981210.0305", 690 | "APW19981211.0628", 691 | "APW19981212.0161", 692 | "APW19981213.0224", 693 | "APW19981216.0275" 694 | ], 695 | "models": [ 696 | "D30053.M.100.T.B", 697 | "D30053.M.100.T.E", 698 | "D30053.M.100.T.F", 699 | "D30053.M.100.T.G" 700 | ] 701 | }, 702 | "D30055": { 703 | "inputs": [ 704 | 
"NYT19981004.0064", 705 | "APW19981004.0320", 706 | "NYT19981005.0306", 707 | "APW19981005.0721", 708 | "APW19981006.0270", 709 | "NYT19981009.0371", 710 | "APW19981009.0494", 711 | "APW19981009.0501", 712 | "APW19981009.0525", 713 | "APW19981012.0281" 714 | ], 715 | "models": [ 716 | "D30055.M.100.T.B", 717 | "D30055.M.100.T.E", 718 | "D30055.M.100.T.F", 719 | "D30055.M.100.T.H" 720 | ] 721 | }, 722 | "D30056": { 723 | "inputs": [ 724 | "APW19981016.0448", 725 | "APW19981208.0906", 726 | "APW19981209.0688", 727 | "APW19981209.1423", 728 | "APW19981209.1425", 729 | "APW19981210.0940", 730 | "APW19981211.1223", 731 | "APW19981212.0189", 732 | "APW19981212.0191", 733 | "APW19981212.0597" 734 | ], 735 | "models": [ 736 | "D30056.M.100.T.B", 737 | "D30056.M.100.T.E", 738 | "D30056.M.100.T.G", 739 | "D30056.M.100.T.H" 740 | ] 741 | }, 742 | "D30059": { 743 | "inputs": [ 744 | "APW19981120.1199", 745 | "APW19981120.1224", 746 | "APW19981120.1237", 747 | "APW19981120.1239", 748 | "APW19981121.0482", 749 | "NYT19981122.0110", 750 | "NYT19981122.0194", 751 | "APW19981122.0610", 752 | "APW19981123.0274", 753 | "APW19981124.0233" 754 | ], 755 | "models": [ 756 | "D30059.M.100.T.B", 757 | "D30059.M.100.T.F", 758 | "D30059.M.100.T.G", 759 | "D30059.M.100.T.H" 760 | ] 761 | }, 762 | "D31001": { 763 | "inputs": [ 764 | "APW19981008.0841", 765 | "APW19981026.0485", 766 | "APW19981026.0787", 767 | "APW19981028.0231", 768 | "NYT19981028.0331", 769 | "NYT19981029.0366", 770 | "NYT19981031.0150", 771 | "APW19981031.0742", 772 | "NYT19981107.0056", 773 | "APW19981109.0767" 774 | ], 775 | "models": [ 776 | "D31001.M.100.T.C", 777 | "D31001.M.100.T.D", 778 | "D31001.M.100.T.E", 779 | "D31001.M.100.T.G" 780 | ] 781 | }, 782 | "D31008": { 783 | "inputs": [ 784 | "NYT19981009.0436", 785 | "NYT19981009.0470", 786 | "NYT19981011.0203", 787 | "NYT19981012.0357", 788 | "NYT19981012.0359", 789 | "NYT19981013.0277", 790 | "NYT19981013.0349", 791 | "NYT19981016.0233", 792 | "NYT19981016.0257", 793 | "NYT19981016.0342" 794 | ], 795 | "models": [ 796 | "D31008.M.100.T.C", 797 | "D31008.M.100.T.D", 798 | "D31008.M.100.T.E", 799 | "D31008.M.100.T.H" 800 | ] 801 | }, 802 | "D31009": { 803 | "inputs": [ 804 | "APW19981111.0309", 805 | "APW19981111.1240", 806 | "APW19981119.0529", 807 | "NYT19981202.0315", 808 | "APW19981202.0880", 809 | "APW19981203.0322", 810 | "APW19981209.0696", 811 | "APW19981219.0504", 812 | "APW19981221.1044", 813 | "APW19981228.0740" 814 | ], 815 | "models": [ 816 | "D31009.M.100.T.B", 817 | "D31009.M.100.T.C", 818 | "D31009.M.100.T.F", 819 | "D31009.M.100.T.G" 820 | ] 821 | }, 822 | "D31013": { 823 | "inputs": [ 824 | "NYT19981102.0465", 825 | "NYT19981104.0619", 826 | "NYT19981104.0623", 827 | "NYT19981105.0521", 828 | "NYT19981106.0565", 829 | "APW19981109.0728", 830 | "NYT19981110.0442", 831 | "NYT19981112.0195", 832 | "NYT19981114.0057", 833 | "NYT19981114.0129" 834 | ], 835 | "models": [ 836 | "D31013.M.100.T.C", 837 | "D31013.M.100.T.D", 838 | "D31013.M.100.T.G", 839 | "D31013.M.100.T.H" 840 | ] 841 | }, 842 | "D31022": { 843 | "inputs": [ 844 | "APW19981030.0230", 845 | "APW19981030.0470", 846 | "APW19981030.0489", 847 | "APW19981030.0792", 848 | "APW19981030.1037", 849 | "APW19981030.1041", 850 | "APW19981030.1046", 851 | "APW19981030.1066", 852 | "APW19981031.0314", 853 | "APW19981031.0551" 854 | ], 855 | "models": [ 856 | "D31022.M.100.T.C", 857 | "D31022.M.100.T.E", 858 | "D31022.M.100.T.F", 859 | "D31022.M.100.T.G" 860 | ] 861 | }, 862 | "D31026": { 863 | "inputs": [ 864 | 
"NYT19981010.0022", 865 | "NYT19981014.0038", 866 | "NYT19981016.0291", 867 | "NYT19981017.0014", 868 | "NYT19981017.0027", 869 | "NYT19981017.0047", 870 | "NYT19981017.0052", 871 | "NYT19981017.0093", 872 | "NYT19981017.0132", 873 | "NYT19981018.0014" 874 | ], 875 | "models": [ 876 | "D31026.M.100.T.C", 877 | "D31026.M.100.T.E", 878 | "D31026.M.100.T.F", 879 | "D31026.M.100.T.H" 880 | ] 881 | }, 882 | "D31031": { 883 | "inputs": [ 884 | "NYT19981001.0442", 885 | "NYT19981001.0499", 886 | "NYT19981011.0194", 887 | "NYT19981013.0339", 888 | "NYT19981013.0341", 889 | "NYT19981013.0399", 890 | "NYT19981013.0427", 891 | "NYT19981016.0245", 892 | "NYT19981016.0293", 893 | "NYT19981016.0312" 894 | ], 895 | "models": [ 896 | "D31031.M.100.T.C", 897 | "D31031.M.100.T.F", 898 | "D31031.M.100.T.G", 899 | "D31031.M.100.T.H" 900 | ] 901 | }, 902 | "D31032": { 903 | "inputs": [ 904 | "APW19981001.0299", 905 | "APW19981007.0563", 906 | "APW19981011.0535", 907 | "APW19981012.0522", 908 | "APW19981012.1126", 909 | "APW19981013.0282", 910 | "APW19981014.0819", 911 | "APW19981015.0170", 912 | "APW19981015.0569", 913 | "APW19981016.0209" 914 | ], 915 | "models": [ 916 | "D31032.M.100.T.D", 917 | "D31032.M.100.T.E", 918 | "D31032.M.100.T.F", 919 | "D31032.M.100.T.G" 920 | ] 921 | }, 922 | "D31033": { 923 | "inputs": [ 924 | "NYT19981007.0302", 925 | "NYT19981009.0282", 926 | "NYT19981012.0400", 927 | "NYT19981018.0065", 928 | "NYT19981018.0079", 929 | "NYT19981018.0084", 930 | "NYT19981018.0089", 931 | "NYT19981018.0091", 932 | "NYT19981018.0093", 933 | "NYT19981024.0043" 934 | ], 935 | "models": [ 936 | "D31033.M.100.T.D", 937 | "D31033.M.100.T.E", 938 | "D31033.M.100.T.F", 939 | "D31033.M.100.T.H" 940 | ] 941 | }, 942 | "D31038": { 943 | "inputs": [ 944 | "APW19981023.0569", 945 | "NYT19981025.0187", 946 | "NYT19981026.0361", 947 | "NYT19981104.0491", 948 | "APW19981104.0772", 949 | "APW19981105.0282", 950 | "NYT19981110.0432", 951 | "APW19981111.0288", 952 | "APW19981112.0551", 953 | "APW19981120.0887" 954 | ], 955 | "models": [ 956 | "D31038.M.100.T.D", 957 | "D31038.M.100.T.E", 958 | "D31038.M.100.T.G", 959 | "D31038.M.100.T.H" 960 | ] 961 | }, 962 | "D31043": { 963 | "inputs": [ 964 | "APW19981006.0802", 965 | "APW19981007.0567", 966 | "APW19981013.0301", 967 | "APW19981015.0163", 968 | "APW19981015.0167", 969 | "APW19981128.0178", 970 | "APW19981129.0871", 971 | "APW19981129.0896", 972 | "APW19981130.1079", 973 | "APW19981130.1085" 974 | ], 975 | "models": [ 976 | "D31043.M.100.T.D", 977 | "D31043.M.100.T.F", 978 | "D31043.M.100.T.G", 979 | "D31043.M.100.T.H" 980 | ] 981 | }, 982 | "D31050": { 983 | "inputs": [ 984 | "APW19981005.0231", 985 | "APW19981220.0356", 986 | "APW19981220.0578", 987 | "APW19981221.0236", 988 | "NYT19981221.0377", 989 | "APW19981221.0719", 990 | "APW19981221.0757", 991 | "NYT19981222.0021", 992 | "APW19981223.0717", 993 | "APW19981224.0149" 994 | ], 995 | "models": [ 996 | "D31050.M.100.T.E", 997 | "D31050.M.100.T.F", 998 | "D31050.M.100.T.G", 999 | "D31050.M.100.T.H" 1000 | ] 1001 | } 1002 | } 1003 | 1004 | --------------------------------------------------------------------------------