├── LICENSE
├── README.md
├── requirements.txt
├── text_summarizer.py
└── url_summarizer.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Assaf Elovic

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# "TL;DR" for online articles
### Text summarizer for online articles using web crawling and NLP (written in Python 3.x)

## Getting started
### 1. Install the dependencies from the command line (Terminal on Mac)
```pip install -r requirements.txt```

### 2. Download the required NLTK data from a Python shell
```
import nltk
nltk.download("stopwords")
nltk.download("punkt")
```

### 3. Run the script with any article URL (or run it with no argument and enter a URL at the prompt)
```python url_summarizer.py {url}```
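
### 4. (Optional) Use the summarizer from your own code
The summarization itself is done by the `FrequencySummarizer` class in `text_summarizer.py`, so it can also be called directly on raw text. A minimal sketch, assuming the NLTK data from step 2 is already downloaded (the sample text is a placeholder):
```
from text_summarizer import FrequencySummarizer

fs = FrequencySummarizer()
text = ("Some long article text goes here. It should contain several sentences. "
        "The summarizer picks the sentences whose words occur most frequently.")
print(fs.summarize(text, 2))  # summarize into 2 sentences
```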
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4
nltk
numpy
setuptools
sumy
requests
--------------------------------------------------------------------------------
/text_summarizer.py:
--------------------------------------------------------------------------------
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest


class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        Initialize the text summarizer.
        Words whose normalized frequency is lower than min_cut
        or higher than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """
        Compute the normalized frequency of each word.
        Input:
          word_sent, a list of sentences already tokenized into words.
        Output:
          freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # Normalize the frequencies, then drop words that are too
        # frequent or too rare to be informative.
        m = float(max(freq.values()))
        for w in list(freq):
            freq[w] = freq[w] / m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
        Return a list of n sentences
        which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        # Score each sentence by the summed frequency of its words.
        ranking = defaultdict(int)
        for i, sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """Return the indices of the n highest-ranked sentences."""
        return nlargest(n, ranking, key=ranking.get)
--------------------------------------------------------------------------------
/url_summarizer.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from text_summarizer import FrequencySummarizer
import requests
import sys


def getTextFromURL(url):
    """Fetch a page and return the concatenated text of its <p> tags."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    text = ' '.join(p.text for p in soup.find_all('p'))
    return text


def summarizeURL(url, total_sentences):
    """Summarize the article at url into total_sentences sentences."""
    # Strip mojibake characters that show up on badly encoded pages.
    url_text = getTextFromURL(url).replace(u"Â", u"").replace(u"â", u"")

    fs = FrequencySummarizer()
    final_summary = fs.summarize(url_text.replace("\n", " "), total_sentences)
    return " ".join(final_summary)


if __name__ == "__main__":
    # Use the URL passed on the command line, or prompt for one.
    url = sys.argv[1] if len(sys.argv) > 1 else input("Enter an article URL\n")
    print(summarizeURL(url, 5))
--------------------------------------------------------------------------------
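
Since the command-line code in `url_summarizer.py` runs only under `if __name__ == "__main__":`, `summarizeURL` can also be imported and reused from other scripts. A minimal sketch (the URL below is a placeholder, not a real article):
```
from url_summarizer import summarizeURL

# Hypothetical article URL, for illustration only.
print(summarizeURL("https://example.com/article", 5))
```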