├── README.md └── summarizer.py /README.md: -------------------------------------------------------------------------------- 1 | # Summarizer 2 | An automatic paraphraser/summarizer/information extractor built using Python. 3 | ## Usage 4 | ### Summarizing a paragraph of text 5 | This is most likely what you're looking for. To summarize text, simply use `Summarizer`'s `summarize()` function: 6 | 7 | summarizer.summarize("insert text here", length=3) 8 | where `length` equals the number of sentences to condense the text down to. 9 | 10 | **Note:** The second argument can be omitted and will default to `3`. In other words, if you omit the `length` argument, the function will return a three-sentence summary. 11 | 12 | ### Functions 13 | `remove_punctuation(text)`: Removes punctuation and converts all letters to lowercase. 14 | 15 | `get_words(text)`: Returns a list of all the words found in `text`. 16 | 17 | `get_sentences(text)`: Returns a list of all the sentences found in `text`. 18 | 19 | `get_word_score(text)`: Counts the number of times a word appears in `text` and returns this data in a dictionary in the format: `{ "word": # of times in text, ...}` 20 | 21 | `get_sentence_score(text)`: Adds the score of each word in the sentences of `text` and returns this data in a dictionary in the format: `{ "sentence": score, ...}` 22 | 23 | `summarize(text, length=3)`: Described [above](https://github.com/Blue9/Summarizer#summarizing-a-paragraph-of-text). 
"""Frequency-based extractive text summarizer.

Sentences are scored by summing the document-wide frequency of the
non-trivial words they contain; the highest scoring sentences form the
summary.  The module runs unchanged on Python 2.7 and Python 3.
"""

import re
import sys
from collections import Counter

# Common English words that carry no topical weight and are never scored.
# Kept as a public, mutable list so callers can extend it at runtime.
ignore = [
    '', 'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it',
    'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but',
    'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an',
    'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up',
    'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make',
    'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take', 'people',
    'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other',
    'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think',
    'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first',
    'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give',
    'day', 'most', 'us', 'is', 'am', 'are', 'was', 'were', 'being', 'been',
    'has', 'have', 'had', 'do', 'does', 'did', 'shall', 'will', 'should',
    'would', 'may', 'might', 'must', 'can', 'could',
]

# Characters stripped before words are counted.
_PUNCTUATION = re.compile(r"[\"'\[\]@#$%^&*()\-_=+,.?<>{}|!:;]")
_WHITESPACE = re.compile(r'\s')
# A period directly followed by one of these is part of a token such as
# "3.14" or "U.S." and therefore does not end a sentence.
_SENTENCE_CONTINUATION = frozenset('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')


def remove_punctuation(text):
    """Return ``text`` lowercased with punctuation characters removed."""
    return _PUNCTUATION.sub('', text.lower())


# Original (misspelled) public name, kept so existing callers keep working.
remove_puncuation = remove_punctuation


def _tokenize(fragment):
    """Split ``fragment`` into lowercase words, dropping ignored words."""
    # Rebuilt on every call so runtime changes to ``ignore`` are honoured.
    ignored = set(ignore)
    return [word for word in remove_punctuation(fragment).split()
            if word not in ignored]


def get_words(text):
    """Return the list of significant words found in ``text``.

    Only the part of each sentence up to and including its closing period
    is considered; words listed in ``ignore`` are left out.
    """
    return _tokenize(' '.join(sentence[1] for sentence in get_sentences(text)))


def get_sentences(text):
    """Split ``text`` into sentences.

    A sentence ends at a period that is not directly followed by a letter
    or digit.  Returns a list of ``[full, core]`` pairs: ``full`` runs up
    to the next whitespace (so it keeps e.g. a closing quote after the
    period) and ``core`` stops at the period itself.  Blank trailing
    fragments are not reported.
    """
    sentences = []
    start = 0
    last = len(text) - 1
    for i, char in enumerate(text):
        if i < start:
            # Already consumed as the tail of the previous sentence (for
            # example the extra dots of an ellipsis).
            continue
        if i == last:
            tail = text[start:]
            if tail.strip():
                sentences.append([tail, tail])
        elif char == '.' and text[i + 1].upper() not in _SENTENCE_CONTINUATION:
            boundary = _WHITESPACE.search(text, i + 1)
            # Without following whitespace the rest of the text is picked
            # up by the final-character branch above.
            if boundary is not None:
                end = boundary.start()
                sentences.append([text[start:end], text[start:i + 1]])
                start = end + 1
    return sentences


def get_word_score(text):
    """Return ``{word: number of occurrences}`` for the significant words."""
    return dict(Counter(get_words(text)))


def get_sentence_score(text):
    """Return ``{sentence: score}`` for every sentence in ``text``.

    A sentence's score is the sum of the document-wide counts of its
    significant words; a sentence without any scores 0.
    """
    word_score = get_word_score(text)  # computed once, not once per word
    sentence_score = {}
    for full, core in get_sentences(text):
        sentence_score[full] = sum(
            word_score.get(word, 0) for word in _tokenize(core))
    return sentence_score


def summarize(text, length=3):
    """Condense ``text`` to its ``length`` highest scoring sentences.

    The selected sentences are returned in their original order, joined
    by single spaces.  Ties are resolved in favour of the earlier
    sentence.  An empty text or a non-positive ``length`` yields ``''``.
    """
    sentence_score = get_sentence_score(text)
    sentences = [sentence[0] for sentence in get_sentences(text)]
    # ``sorted`` is stable, so equally scored sentences keep document order.
    ranked = sorted(
        range(len(sentences)),
        key=lambda index: -sentence_score.get(sentences[index], 0))
    chosen = sorted(ranked[:max(length, 0)])
    return ' '.join(sentences[index] for index in chosen)


def prompt_user():
    """Read one line from standard input and print its summary."""
    try:
        read_line = raw_input  # Python 2  # noqa: F821
    except NameError:
        read_line = input  # Python 3
    print(summarize(str(read_line())))