├── .travis.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── context.py
├── getSummary.py
├── requirements.txt
├── setup.py
└── util.py

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.6"
  - "2.7"
  - "3.2"
  - "3.3"
  - "3.4"
  - "3.5"
  - "3.5-dev"  # 3.5 development branch
  - "nightly"  # currently points to 3.6-dev
# command to install dependencies
install: "pip install -r requirements.txt"
# command to run tests
script: nosetests

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
This project is licensed under the MIT License:

Copyright (c) 2015-2016: Vipul Sharma

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

This project uses the following external libraries, which have their own licenses:

[NLTK](https://github.com/nltk/nltk/blob/develop/LICENSE.txt) [Apache]

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
# Include the license file
include LICENSE.txt

# Include the data files
recursive-include . *

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
SUMMRIZER
=========

A naive script to extract a summary from text content.

Please read the blog post; it contains the complete details.

* Blog Post: http://www.vipul.xyz/2015/10/summrizer-text-summarizer_27.html

Testing results against:

* http://smmry.com
* http://freesummarizer.com
* http://autosummarizer.com/index.php
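USAGE
=====

A minimal sketch of how the scripts can be run (an illustration, assuming
the required NLTK data -- the ``brown`` corpus, ``stopwords`` and the
``punkt`` tokenizer -- has already been downloaded)::

    python -c "import nltk; nltk.download('brown'); nltk.download('stopwords'); nltk.download('punkt')"
    python getSummary.py    # scores and prints each sentence of the sample text

``context.py`` exposes a ``ContextExtract`` class whose ``get_info(content)``
method returns the key topics of ``content``; its ``main()`` call is
commented out, so it is meant to be imported rather than run directly.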
LICENSE
=======

This project is licensed under the MIT License:

Copyright (c) 2015-2016: Vipul Sharma

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

This project uses the following external libraries, which have their own licenses:

* `NLTK <https://github.com/nltk/nltk/blob/develop/LICENSE.txt>`_ [Apache]

--------------------------------------------------------------------------------
/context.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Script to extract important topics from content
"""

import nltk
from nltk.corpus import brown
import util


# tagged sentences from the Brown news corpus, used to train the taggers
train = brown.tagged_sents(categories='news')

# regex tagger, used as the final backoff when the trained taggers fail
regex_tag = nltk.RegexpTagger([
    (r'^[-:]?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'.*able$', 'JJ'),
    (r'^[A-Z].*$', 'NNP'),
    (r'.*ly$', 'RB'),
    (r'.*s$', 'NNS'),
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*', 'NN')
])

unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)

# custom defined CFG: rules for merging adjacent tags into a single chunk
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
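# An illustrative note on the rules above (an assumption about typical
# tagger output, not part of the original source): a re-tagged pair such
# as ('new', 'JJ'), ('service', 'NN') matches cfg['JJ+NN'] and is merged
# by get_info() below into the single chunk ('new service', 'NNI'), so
# multi-word topics survive extraction.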
class ContextExtract(object):
    """
    Extracts context of the text content, relevant topics from the text
    """

    def get_info(self, content):
        words = util.getWords(content)
        temp_tags = bigram_tag.tag(words)
        tags = self.re_tag(temp_tags)
        # repeatedly merge adjacent pairs that match a CFG rule, until no
        # rule applies any more
        normalized = True
        while normalized:
            normalized = False
            for i in range(0, len(tags) - 1):
                tagged1 = tags[i]
                if i + 1 >= len(tags):
                    break
                tagged2 = tags[i + 1]
                key = tagged1[1] + '+' + tagged2[1]
                pos = cfg.get(key)
                if pos:
                    tags.pop(i)
                    tags.pop(i)
                    re_tagged = tagged1[0] + ' ' + tagged2[0]
                    tags.insert(i, (re_tagged, pos))
                    normalized = True

        final_context = []
        for tag in tags:
            if tag[1] == 'NNP' or tag[1] == 'NNI':
                final_context.append(tag[0])
        return final_context

    def re_tag(self, tagged):
        """
        Maps Brown corpus tags onto the simplified tag set used by `cfg`
        """
        new_tagged = []
        for tag in tagged:
            if tag[1] == 'NP' or tag[1] == 'NP-TL':
                new_tagged.append((tag[0], 'NNP'))
            elif tag[1][-3:] == '-TL':
                new_tagged.append((tag[0], tag[1][:-3]))
            elif tag[1][-1:] == 'S':
                new_tagged.append((tag[0], tag[1][:-1]))
            else:
                new_tagged.append((tag[0], tag[1]))
        return new_tagged


def main():
    # content = raw_input("Content: ")
    content = """
    The BBC has been testing a new service called SoundIndex, which
    lists the top 1,000 artists based on discussions crawled from Bebo,
    Last.fm, Google Groups, iTunes, MySpace and YouTube. The top five
    bands according to SoundIndex right now are Coldplay, Rihanna, The
    Ting Tings, Duffy and Mariah Carey, but the index is refreshed
    every six hours. SoundIndex also lets users sort by popular tracks,
    search by artist, or create customized charts based on music
    preferences or filters by age range, sex or location. Results can
    also be limited to just one data source (such as Last.fm).
    """
    np = ContextExtract()
    context = np.get_info(content)
    print(context)
# main()

--------------------------------------------------------------------------------
/getSummary.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Script to score sentences to get summary
"""

from __future__ import print_function

from nltk.corpus import stopwords
import util
import re


def format_sentence(sentence):
    """
    Strips all non-word characters from a sentence (currently unused)
    """
    sentence = re.sub(r'\W+', '', sentence)
    return sentence


def scoreSentences(sen1, sen2):
    """
    Compares two sentences, finds their word intersection and scores them
    :param sen1: (str) sentence
    :param sen2: (str) sentence
    :returns: score
    """
    # TODO: Better scoring algorithm
    # sen1 = format_sentence(sen1)
    # sen2 = format_sentence(sen2)
    s1 = set(sen1.lower().split())
    s2 = set(sen2.lower().split())
    score = 0
    if s1 and s2:
        avg = (len(s1) + len(s2)) / 2.0
        score = len(s1.intersection(s2)) / avg
    return score
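# A quick worked example of the scoring above (an illustration, not part
# of the original script):
#
#     scoreSentences('the cat sat', 'the cat ran')
#     # s1 = {'the', 'cat', 'sat'}, s2 = {'the', 'cat', 'ran'}
#     # avg = (3 + 3) / 2.0 = 3.0, intersection has 2 words
#     # score = 2 / 3.0 ~ 0.67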
def remove_stopwords(sentences):
    """
    Removes stopwords from the sentences
    :param sentences: (list) sentences
    :returns: cleaned sentences without any stopwords
    """
    sw = set(stopwords.words('english'))
    cleaned = []
    for sentence in sentences:
        words = util.getWords(sentence)
        sentence = ' '.join([c for c in words if c not in sw])
        cleaned.append(sentence)
    return cleaned


def sentenceGraph(sentences):
    """
    Creates an all-pairs score graph of the sentences
    :param sentences: (list) list of sentences
    :returns: graph of the pairwise sentence scores; each row skips the
        sentence's comparison with itself
    """
    scoreGraph = []
    len_sen = len(sentences)
    for i in range(len_sen):
        weight = []
        for j in range(len_sen):
            if i == j:
                continue
            weight.append(scoreSentences(sentences[i], sentences[j]))
        scoreGraph.append(weight)

    return scoreGraph


def build(sentences, scoreGraph, orig_sentences):
    """
    Builds the content summary based on the graph
    :param sentences: (list) list of cleaned sentences
    :param scoreGraph: (list) 2 dimensional list-graph of scores
    :param orig_sentences: (list) original sentences, used as dict keys
    :returns: Aggregate score of each sentence in `sentences`
    """
    aggregateScore = dict()
    sen = 0
    for scores in scoreGraph:
        # a sentence's aggregate score is the sum of its similarity to
        # every other sentence in the paragraph
        aggregate = sum(scores)
        aggregateScore[orig_sentences[sen]] = aggregate
        sen += 1
    return aggregateScore
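# A sketch of how these aggregate scores could be reduced to a final
# summary (an illustration, not part of the original script; `score` and
# `orig_sentences` carry the names used in main() below):
#
#     ranked = sorted(score, key=score.get, reverse=True)[:2]
#     summary = ' '.join(s for s in orig_sentences if s in ranked)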
def main():
    """
    Execution starts here.
    Inputs the content to be summarized.
    """
    # content = raw_input('Content: ')
    content = """
    The BBC has been testing a new service called SoundIndex, which lists the
    top 1,000 artists based on discussions crawled from Bebo, Last.fm, Google
    Groups, iTunes, MySpace and YouTube. The top five bands according to
    SoundIndex right now are Coldplay, Rihanna, The Ting Tings, Duffy and
    Mariah Carey, but the index is refreshed every six hours. SoundIndex also
    lets users sort by popular tracks, search by artist, or create customized
    charts based on music preferences or filters by age range, sex or location.
    Results can also be limited to just one data source (such as Last.fm).
    """
    paragraphs = util.getParagraphs(content)
    count = 0
    for paragraph in paragraphs:
        if paragraph:
            orig_sentences, indexed = util.getSentences(paragraph)
            sentences = remove_stopwords(orig_sentences)
            graph = sentenceGraph(sentences)
            score = build(sentences, graph, orig_sentences)
            print('Paragraph:', count)
            count += 1
            for i in indexed:
                print(indexed[i], score[indexed[i]])


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Django==1.8.5
nltk==3.1
wheel==0.24.0

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
from codecs import open
from os import path

here = path.abspath(path.dirname(__file__))

with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='summrizer',

    version='1.0.0.dev1',

    description='A text summarizing script',
    long_description=long_description,

    # The project's main homepage.
    url='https://github.com/vipul-sharma20/summrizer',

    # Author details
    author='Vipul Sharma',
    author_email='vipul.sharma20@gmail.com',

    license='MIT',

    classifiers=[
        'Development Status :: 3 - Alpha',

        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',

        'License :: OSI Approved :: MIT License',

        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
    ],

    keywords='nlp summary',

    packages=find_packages(exclude=['contrib', 'docs', 'tests']),

    install_requires=['nltk>=3.1'],

    extras_require={
        'dev': ['check-manifest'],
        'test': ['coverage'],
    },

    entry_points={
        'console_scripts': [
            'summrizer=getSummary:main',
        ],
    },
)

--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
"""
Utility functions for filtering content
"""
from nltk import tokenize
from nltk.tokenize import word_tokenize


def getWords(sentence):
    """
    Extracts words/tokens from a sentence
    :param sentence: (str) sentence
    :returns: list of tokens
    """
    words = word_tokenize(sentence)
    return words


def getParagraphs(content):
    """
    Extracts paragraphs from the text content
    :param content: (str) text content
    :returns: list of paragraphs
    """
    paraList = content.split('\n\n')
    return paraList


def getSentences(paragraph):
    """
    Extracts sentences from a paragraph
    :param paragraph: (str) paragraph text
    :returns: list of sentences, plus a dict mapping index -> sentence
    """
    indexed = {}
    sentenceList = tokenize.sent_tokenize(paragraph)
    for i, s in enumerate(sentenceList):
        indexed[i] = s
    return sentenceList, indexed
--------------------------------------------------------------------------------