├── LICENSE
├── README.md
├── requirements.txt
├── text_summarizer.py
└── url_summarizer.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Assaf Elovic

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# "TL;DR" for online articles
### Text summarizer for online articles using web crawling and NLP (written in Python 3.x)

## Getting started
### 1. Install the dependencies from the command line (Terminal on Mac)
```pip install -r requirements.txt```

### 2. Download the required NLTK data from a Python shell
```
import nltk
nltk.download("stopwords")
nltk.download("punkt")
```

### 3. Run the script with any article URL (or run it with no argument and enter a URL at the prompt)
```python url_summarizer.py {url}```
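
### 4. (Optional) Use the summarizer from your own code
The summarization itself is done by the `FrequencySummarizer` class in `text_summarizer.py`, so it can also be called directly on raw text. A minimal sketch, assuming the NLTK data from step 2 is already downloaded (the sample text is a placeholder):
```
from text_summarizer import FrequencySummarizer

fs = FrequencySummarizer()
text = ("Some long article text goes here. It should contain several sentences. "
        "The summarizer picks the sentences whose words occur most frequently.")
print(fs.summarize(text, 2))  # summarize into 2 sentences
```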
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4
nltk
numpy
setuptools
sumy
requests
--------------------------------------------------------------------------------
/text_summarizer.py:
--------------------------------------------------------------------------------
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest


class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words whose normalized frequency is lower than min_cut
        or higher than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """
        Compute the normalized frequency of each word.
        Input:
          word_sent, a list of sentences already tokenized into words.
        Output:
          freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # Normalize the frequencies, then drop words that are too
        # frequent or too rare to be informative.
        m = float(max(freq.values()))
        for w in list(freq):
            freq[w] = freq[w] / m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
        Return a list of n sentences
        which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        # Score each sentence by the summed frequency of its words.
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the indices of the n highest-ranked sentences."""
        return nlargest(n, ranking, key=ranking.get)
--------------------------------------------------------------------------------
/url_summarizer.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from text_summarizer import FrequencySummarizer
import requests
import sys


def getTextFromURL(url):
    """Fetch a page and return the concatenated text of its <p> tags."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    text = ' '.join(p.text for p in soup.find_all('p'))
    return text


def summarizeURL(url, total_sentences):
    """Summarize the article at url into total_sentences sentences."""
    # Strip mojibake characters that show up on badly encoded pages.
    url_text = getTextFromURL(url).replace(u"Â", u"").replace(u"â", u"")

    fs = FrequencySummarizer()
    final_summary = fs.summarize(url_text.replace("\n", " "), total_sentences)
    return " ".join(final_summary)


if __name__ == "__main__":
    # Use the URL passed on the command line, or prompt for one.
    url = sys.argv[1] if len(sys.argv) > 1 else input("Enter an article URL\n")
    print(summarizeURL(url, 5))
--------------------------------------------------------------------------------
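
Since the command-line code in `url_summarizer.py` runs only under `if __name__ == "__main__":`, `summarizeURL` can also be imported and reused from other scripts. A minimal sketch (the URL below is a placeholder, not a real article):
```
from url_summarizer import summarizeURL

# Hypothetical article URL, for illustration only.
print(summarizeURL("https://example.com/article", 5))
```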