├── .travis.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── context.py
├── getSummary.py
├── requirements.txt
├── setup.py
└── util.py

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.6"
  - "2.7"
  - "3.2"
  - "3.3"
  - "3.4"
  - "3.5"
  - "3.5-dev"  # 3.5 development branch
  - "nightly"  # currently points to 3.6-dev
# command to install dependencies
install: "pip install -r requirements.txt"
# command to run tests
script: nosetests

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
This project is licensed under the MIT License:

Copyright (c) 2015-2016: Vipul Sharma

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

This project uses the following external libraries, which have their own licenses:

[NLTK](https://github.com/nltk/nltk/blob/develop/LICENSE.txt) [Apache]

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
# Include the license file
include LICENSE.txt

# Include the data files
recursive-include . *

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
SUMMRIZER
=========

A naive script to extract a summary from text content.

Please read the blog post; it contains the complete details.

* Blog Post: http://www.vipul.xyz/2015/10/summrizer-text-summarizer_27.html

Testing results against:

* http://smmry.com
* http://freesummarizer.com
* http://autosummarizer.com/index.php
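USAGE
=====

A minimal sketch of how the scripts can be run (an illustration, assuming
the required NLTK data -- the ``brown`` corpus, ``stopwords`` and the
``punkt`` tokenizer -- has already been downloaded)::

    python -c "import nltk; nltk.download('brown'); nltk.download('stopwords'); nltk.download('punkt')"
    python getSummary.py    # scores and prints each sentence of the sample text

``context.py`` exposes a ``ContextExtract`` class whose ``get_info(content)``
method returns the key topics of ``content``; its ``main()`` call is
commented out, so it is meant to be imported rather than run directly.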
LICENSE
=======

This project is licensed under the MIT License:

Copyright (c) 2015-2016: Vipul Sharma

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

This project uses the following external libraries, which have their own licenses:

* `NLTK <https://github.com/nltk/nltk/blob/develop/LICENSE.txt>`_ [Apache]

--------------------------------------------------------------------------------
/context.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Script to extract important topics from content
"""

import nltk
from nltk.corpus import brown
import util


# tagged sentences from the Brown news corpus, used to train the taggers
train = brown.tagged_sents(categories='news')

# regex tagger, used as the final backoff when the trained taggers fail
regex_tag = nltk.RegexpTagger([
    (r'^[-:]?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'.*able$', 'JJ'),
    (r'^[A-Z].*$', 'NNP'),
    (r'.*ly$', 'RB'),
    (r'.*s$', 'NNS'),
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*', 'NN')
])

unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)

# custom defined CFG: rules for merging adjacent tags into a single chunk
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
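# An illustrative note on the rules above (an assumption about typical
# tagger output, not part of the original source): a re-tagged pair such
# as ('new', 'JJ'), ('service', 'NN') matches cfg['JJ+NN'] and is merged
# by get_info() below into the single chunk ('new service', 'NNI'), so
# multi-word topics survive extraction.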
class ContextExtract(object):
    """
    Extracts context of the text content, relevant topics from the text
    """

    def get_info(self, content):
        words = util.getWords(content)
        temp_tags = bigram_tag.tag(words)
        tags = self.re_tag(temp_tags)
        # repeatedly merge adjacent pairs that match a CFG rule, until no
        # rule applies any more
        normalized = True
        while normalized:
            normalized = False
            for i in range(0, len(tags) - 1):
                tagged1 = tags[i]
                if i + 1 >= len(tags):
                    break
                tagged2 = tags[i + 1]
                key = tagged1[1] + '+' + tagged2[1]
                pos = cfg.get(key)
                if pos:
                    tags.pop(i)
                    tags.pop(i)
                    re_tagged = tagged1[0] + ' ' + tagged2[0]
                    tags.insert(i, (re_tagged, pos))
                    normalized = True

        final_context = []
        for tag in tags:
            if tag[1] == 'NNP' or tag[1] == 'NNI':
                final_context.append(tag[0])
        return final_context

    def re_tag(self, tagged):
        """
        Maps Brown corpus tags onto the simplified tag set used by `cfg`
        """
        new_tagged = []
        for tag in tagged:
            if tag[1] == 'NP' or tag[1] == 'NP-TL':
                new_tagged.append((tag[0], 'NNP'))
            elif tag[1][-3:] == '-TL':
                new_tagged.append((tag[0], tag[1][:-3]))
            elif tag[1][-1:] == 'S':
                new_tagged.append((tag[0], tag[1][:-1]))
            else:
                new_tagged.append((tag[0], tag[1]))
        return new_tagged


def main():
    # content = raw_input("Content: ")
    content = """
    The BBC has been testing a new service called SoundIndex, which
    lists the top 1,000 artists based on discussions crawled from Bebo,
    Last.fm, Google Groups, iTunes, MySpace and YouTube. The top five
    bands according to SoundIndex right now are Coldplay, Rihanna, The
    Ting Tings, Duffy and Mariah Carey, but the index is refreshed
    every six hours. SoundIndex also lets users sort by popular tracks,
    search by artist, or create customized charts based on music
    preferences or filters by age range, sex or location. Results can
    also be limited to just one data source (such as Last.fm).
    """
    np = ContextExtract()
    context = np.get_info(content)
    print(context)
# main()

--------------------------------------------------------------------------------
/getSummary.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Script to score sentences to get summary
"""

from __future__ import print_function

from nltk.corpus import stopwords
import util
import re


def format_sentence(sentence):
    """
    Strips all non-word characters from a sentence (currently unused)
    """
    sentence = re.sub(r'\W+', '', sentence)
    return sentence


def scoreSentences(sen1, sen2):
    """
    Compares two sentences, finds their word intersection and scores them
    :param sen1: (str) sentence
    :param sen2: (str) sentence
    :returns: score
    """
    # TODO: Better scoring algorithm
    # sen1 = format_sentence(sen1)
    # sen2 = format_sentence(sen2)
    s1 = set(sen1.lower().split())
    s2 = set(sen2.lower().split())
    score = 0
    if s1 and s2:
        avg = (len(s1) + len(s2)) / 2.0
        score = len(s1.intersection(s2)) / avg
    return score
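# A quick worked example of the scoring above (an illustration, not part
# of the original script):
#
#     scoreSentences('the cat sat', 'the cat ran')
#     # s1 = {'the', 'cat', 'sat'}, s2 = {'the', 'cat', 'ran'}
#     # avg = (3 + 3) / 2.0 = 3.0, intersection has 2 words
#     # score = 2 / 3.0 ~ 0.67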
def remove_stopwords(sentences):
    """
    Removes stopwords from the sentences
    :param sentences: (list) sentences
    :returns: cleaned sentences without any stopwords
    """
    sw = set(stopwords.words('english'))
    cleaned = []
    for sentence in sentences:
        words = util.getWords(sentence)
        sentence = ' '.join([c for c in words if c not in sw])
        cleaned.append(sentence)
    return cleaned


def sentenceGraph(sentences):
    """
    Creates an all-pairs score graph of the sentences
    :param sentences: (list) list of sentences
    :returns: graph of the pairwise sentence scores; each row skips the
        sentence's comparison with itself
    """
    scoreGraph = []
    len_sen = len(sentences)
    for i in range(len_sen):
        weight = []
        for j in range(len_sen):
            if i == j:
                continue
            weight.append(scoreSentences(sentences[i], sentences[j]))
        scoreGraph.append(weight)

    return scoreGraph


def build(sentences, scoreGraph, orig_sentences):
    """
    Builds the content summary based on the graph
    :param sentences: (list) list of cleaned sentences
    :param scoreGraph: (list) 2 dimensional list-graph of scores
    :param orig_sentences: (list) original sentences, used as dict keys
    :returns: Aggregate score of each sentence in `sentences`
    """
    aggregateScore = dict()
    sen = 0
    for scores in scoreGraph:
        # a sentence's aggregate score is the sum of its similarity to
        # every other sentence in the paragraph
        aggregate = sum(scores)
        aggregateScore[orig_sentences[sen]] = aggregate
        sen += 1
    return aggregateScore
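# A sketch of how these aggregate scores could be reduced to a final
# summary (an illustration, not part of the original script; `score` and
# `orig_sentences` carry the names used in main() below):
#
#     ranked = sorted(score, key=score.get, reverse=True)[:2]
#     summary = ' '.join(s for s in orig_sentences if s in ranked)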
def main():
    """
    Execution starts here.
    Inputs the content to be summarized.
    """
    # content = raw_input('Content: ')
    content = """
    The BBC has been testing a new service called SoundIndex, which lists the
    top 1,000 artists based on discussions crawled from Bebo, Last.fm, Google
    Groups, iTunes, MySpace and YouTube. The top five bands according to
    SoundIndex right now are Coldplay, Rihanna, The Ting Tings, Duffy and
    Mariah Carey, but the index is refreshed every six hours. SoundIndex also
    lets users sort by popular tracks, search by artist, or create customized
    charts based on music preferences or filters by age range, sex or location.
    Results can also be limited to just one data source (such as Last.fm).
    """
    paragraphs = util.getParagraphs(content)
    count = 0
    for paragraph in paragraphs:
        if paragraph:
            orig_sentences, indexed = util.getSentences(paragraph)
            sentences = remove_stopwords(orig_sentences)
            graph = sentenceGraph(sentences)
            score = build(sentences, graph, orig_sentences)
            print('Paragraph:', count)
            count += 1
            for i in indexed:
                print(indexed[i], score[indexed[i]])


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Django==1.8.5
nltk==3.1
wheel==0.24.0

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))

with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='summrizer',

    version='1.0.0.dev1',

    description='A text summarizing script',
    long_description=long_description,

    # The project's main homepage.
    url='https://github.com/vipul-sharma20/summrizer',

    # Author details
    author='Vipul Sharma',
    author_email='vipul.sharma20@gmail.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',

        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',

        'License :: OSI Approved :: MIT License',

        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
    ],

    keywords='nlp summary',

    packages=find_packages(exclude=['contrib', 'docs', 'tests']),

    install_requires=['nltk>=3.1'],

    extras_require={
        'dev': ['check-manifest'],
        'test': ['coverage'],
    },

    entry_points={
        'console_scripts': [
            'summrizer=getSummary:main',
        ],
    },
)

--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
"""
Utility functions for filtering content
"""
from nltk import tokenize
from nltk.tokenize import word_tokenize


def getWords(sentence):
    """
    Extracts words/tokens from a sentence
    :param sentence: (str) sentence
    :returns: list of tokens
    """
    words = word_tokenize(sentence)
    return words


def getParagraphs(content):
    """
    Extracts paragraphs from the text content
    :param content: (str) text content
    :returns: list of paragraphs
    """
    paraList = content.split('\n\n')
    return paraList


def getSentences(paragraph):
    """
    Extracts sentences from a paragraph
    :param paragraph: (str) paragraph text
    :returns: list of sentences, plus a dict mapping index -> sentence
    """
    indexed = {}
    sentenceList = tokenize.sent_tokenize(paragraph)
    for i, s in enumerate(sentenceList):
        indexed[i] = s
    return sentenceList, indexed
--------------------------------------------------------------------------------