├── README.md
└── summarizer.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Summarizer
 2 | An automatic paraphraser/summarizer/information extractor built using Python.
 3 | ## Usage
 4 | ### Summarizing a paragraph of text
 5 | This is most likely what you're looking for. To summarize text, simply use `Summarizer`'s `summarize()` function:
 6 | 
 7 |     summarizer.summarize("insert text here", length=3)
 8 | where `length` equals the number of sentences to condense the text down to.
 9 | 
10 | **Note:** The second argument can be omitted and will default to `3`. In other words, if you omit the `length` argument, the function will return a three sentence summary.
11 | 
12 | ### Functions
13 | `remove_punctuation(text)`: Removes punctuation and converts all letters to lowercase.
14 | 
15 | `get_words(text)`: Returns a list of all the words found in `text`.
16 | 
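`get_sentences(text)`: Returns a list of all the sentences found in `text`. Each entry is a two-item list: the sentence including any characters that trail its closing period (such as a closing quote), and the sentence cut off right after that period.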
18 | 
`get_word_score(text)`: Counts the number of times a word appears in `text` and returns this data in a dictionary in the format: `{ "word": # of times in text, ...}`
20 | 
21 | `get_sentence_score(text)`: Adds the score of each word in the sentences of `text` and returns this data in a dictionary in the format: `{ "sentence": score, ...}`
22 | 
`summarize(text, length=3)`: Described [above](https://github.com/Blue9/Summarizer#summarizing-a-paragraph-of-text).
24 | 


--------------------------------------------------------------------------------
/summarizer.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import sys
 3 | 
 4 | ignore = ['', 'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us', 'is', 'am', 'are', 'was', 'were', 'being', 'been', 'has', 'have', 'had', 'do', 'does', 'did', 'shall', 'will', 'should', 'would', 'may', 'might', 'must', 'can', 'could']
 5 | 
 6 | def remove_puncuation(text):
 7 |     return re.sub('[\"\'\[\]@#\$%\^&\*\(\)\-_=\+,\.\?<>{}\|]', '', text.lower())
 8 | 
 9 | def get_words(text):
10 |     refined = ''
11 |     sentences = get_sentences(text)
12 |     for sentence in sentences:
13 |         refined += sentence[1] + ' '
14 |     raw_words = re.split(' ', remove_puncuation(refined))
15 |     word_list = [word for word in raw_words if word not in ignore]
16 |     return word_list
17 | 
def get_sentences(text):
    """Split ``text`` into sentences.

    A sentence ends at a period whose following character is not an ASCII
    letter or digit (so ``3.5`` and ``e.g`` do not end a sentence).  Any
    characters between that period and the next whitespace, such as a
    closing quote, are kept with the sentence.  Only ``.`` is treated as a
    terminator.

    :param text: the text to split.
    :returns: a list of two-item lists ``[full, trimmed]`` in text order,
        where ``full`` is the sentence including any characters trailing the
        period and ``trimmed`` stops right after the period.  For the final
        sentence both items are the remaining text.  Whitespace-only input
        yields an empty list.
    """
    continuation = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    sentence_list = []
    size = len(text)
    last = size - 1
    start = 0
    for i, char in enumerate(text):
        if i < start:
            # Already consumed as the tail of the previous sentence (for
            # example the extra dots of "..." or the separating whitespace);
            # re-examining it would emit empty sentences.
            continue
        if i == last:
            tail = text[start:]
            # Skip a whitespace-only remainder instead of emitting ['', ''].
            if tail.strip():
                sentence_list.append([tail, tail])
        elif char == '.' and text[i + 1].upper() not in continuation:
            # Extend to the next whitespace so trailing quotes/brackets stay
            # attached to the sentence they close.
            end = i + 1
            while end < size and not text[end].isspace():
                end += 1
            if end < size:
                sentence_list.append([text[start:end], text[start:i + 1]])
                # Step over the whole whitespace run so the next sentence
                # does not begin with blanks.
                start = end + 1
                while start < size and text[start].isspace():
                    start += 1
            # With no whitespace left, the rest of the text is emitted as
            # the final sentence once the last character is reached.
    return sentence_list
32 | 
def get_word_score(text):
    """Count how often each significant word occurs in ``text``.

    Words are obtained from ``get_words``, so they are lower-cased, free of
    punctuation and exclude the stop words in ``ignore``.

    :param text: the text to analyse.
    :returns: a dict mapping each word to its number of occurrences.
    """
    word_score = {}
    for word in get_words(text):
        # Start from 0 and add 1 so that the first occurrence counts too.
        word_score[word] = word_score.get(word, 0) + 1
    return word_score
41 | 
def get_sentence_score(text):
    """Score every sentence of ``text`` by the frequency of its words.

    A sentence's score is the sum, over each significant word it contains,
    of that word's score from ``get_word_score(text)``.

    :param text: the text to analyse.
    :returns: a dict mapping each sentence (the first item of the pairs
        returned by ``get_sentences``) to its score.  Every sentence gets an
        entry; a sentence without significant words scores 0.
    """
    # Computed once: the word scores depend only on the whole text.
    word_score = get_word_score(text)
    sentence_score = {}
    for sentence in get_sentences(text):
        sentence_score[sentence[0]] = sum(
            word_score.get(word, 0) for word in get_words(sentence[1])
        )
    return sentence_score
51 | 
def summarize(text, length=3):
    """Condense ``text`` to its ``length`` highest-scoring sentences.

    Sentences are ranked with ``get_sentence_score``; the selected ones are
    returned in the order in which they appear in ``text``.  When scores
    tie, the sentence that occurs earlier in the text is preferred.  A
    sentence that occurs several times in the text is emitted once.

    :param text: the text to summarise.
    :param length: maximum number of sentences in the summary; values of 0
        or less produce an empty summary.
    :returns: the selected sentences, each followed by a single space (so a
        non-empty summary ends with a space); ``''`` if nothing is selected.
    """
    sentence_score = get_sentence_score(text)

    # Distinct scored sentences in text order.  Sentences without a score
    # entry are skipped rather than looked up, which avoids a KeyError.
    in_order = []
    seen = set()
    for sentence in get_sentences(text):
        full = sentence[0]
        if full in sentence_score and full not in seen:
            seen.add(full)
            in_order.append(full)

    # sorted() is stable, including with reverse=True, so equal scores keep
    # their text order and earlier sentences win ties.
    ranked = sorted(in_order, key=lambda s: sentence_score[s], reverse=True)
    # Clamp: a negative slice bound would select all but the last entries.
    chosen = set(ranked[:max(length, 0)])

    return ''.join(full + ' ' for full in in_order if full in chosen)
70 | 
def prompt_user():
    """Read one line from standard input and print its summary.

    The summary uses the default length of ``summarize``.  Works on both
    Python 2 and Python 3.
    """
    try:
        read_line = raw_input  # Python 2: returns the raw line as a string
    except NameError:
        read_line = input  # Python 3: raw_input was renamed to input
    # Parenthesised single-argument print is valid in Python 2 and 3.
    print(summarize(str(read_line())))
73 | 


--------------------------------------------------------------------------------