├── .gitignore
├── LICENSE.txt
├── README.md
├── readability.py
├── syllables_en.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | 
29 | # Translations
30 | *.mo
31 | 
32 | # Mr Developer
33 | .mr.developer.cfg
34 | .project
35 | .pydevproject
36 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2001-2011 NLTK Project
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the 'License');
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |    http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an 'AS IS' BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Readability
 2 | ====================
 3 | 
 4 | A collection of functions that measure the readability of a given body of text. I'd
 5 | recommend checking out the wikipedia articles below--most of the metrics estimate 
 6 | the grade level required to comprehend a given block of text and may return odd results
 7 | on small snippets of text.
 8 | 
 9 | To get up and running you'll need [NLTK](http://nltk.org/) and will need the punkt
10 | data set:
11 | 
12 |     shell$ pip install nltk
13 |     shell$ python
14 |     >>import nltk
15 |     >>nltk.download('punkt')
16 | 
17 | Demo:
18 | 
19 |     shell$ python readability.py
20 |     Test text:
21 |     "We are close to wrapping up our 10 week Rails Course. This week we will cover a handful of topics commonly encountered in Rails projects. We then wrap up with part 2 of our Reddit on Rails exercise!  By now you should be hard at work on your personal projects. The students in the course just presented in front of the class with some live demos and a brief intro to to the problems their app were solving. Maybe set aside some time this week to show someone your progress, block off 5 minutes and describe what goal you are working towards, the current state of the project (is it almost done, just getting started, needs UI, etc.), and then show them a quick demo of the app. Explain what type of feedback you are looking for (conceptual, design, usability, etc.) and see what they have to say.  As we are wrapping up the course you need to be focused on learning as much as you can, but also making sure you have the tools to succeed after the class is over."
22 | 
23 |     ARI:  7.2164516129
24 |     FleschReadingEase:  88.9553
25 |     FleschKincaidGradeLevel:  5.3235
26 |     GunningFogIndex:  9.1355
27 |     SMOGIndex:  8.19615242271
28 |     ColemanLiauIndex:  6.7804
29 |     LIX:  35.2666666667
30 |     RIX:  3.1
31 | 
32 | The following readability metrics are included in readability.py:
33 | 
34 | 1. http://en.wikipedia.org/wiki/Automated_Readability_Index
35 | 2. http://en.wikipedia.org/wiki/SMOG
36 | 3. http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_Grade_Level#Flesch.E2.80.93Kincaid_Grade_Level
37 | 4. http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_test#Flesch_Reading_Ease
38 | 5. http://en.wikipedia.org/wiki/Coleman-Liau_Index
39 | 6. http://en.wikipedia.org/wiki/Gunning-Fog_Index
40 | 
41 | Largely lifted from:
42 | 
43 |     https://github.com/nltk/nltk_contrib/tree/master/nltk_contrib/readability
44 | 
45 | SMOG index appears to perform most accurately.
46 | 


--------------------------------------------------------------------------------
/readability.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import math
  4 | 
  5 | from utils import get_char_count
  6 | from utils import get_words
  7 | from utils import get_sentences
  8 | from utils import count_syllables
  9 | from utils import count_complex_words
 10 | 
 11 | 
 12 | class Readability:
 13 |     analyzedVars = {}
 14 | 
 15 |     def __init__(self, text):
 16 |         self.analyze_text(text)
 17 | 
 18 |     def analyze_text(self, text):
 19 |         words = get_words(text)
 20 |         char_count = get_char_count(words)
 21 |         word_count = len(words)
 22 |         sentence_count = len(get_sentences(text))
 23 |         syllable_count = count_syllables(words)
 24 |         complexwords_count = count_complex_words(text)
 25 |         avg_words_p_sentence = word_count/sentence_count
 26 |         
 27 |         self.analyzedVars = {
 28 |             'words': words,
 29 |             'char_cnt': float(char_count),
 30 |             'word_cnt': float(word_count),
 31 |             'sentence_cnt': float(sentence_count),
 32 |             'syllable_cnt': float(syllable_count),
 33 |             'complex_word_cnt': float(complexwords_count),
 34 |             'avg_words_p_sentence': float(avg_words_p_sentence)
 35 |         }
 36 | 
 37 |     def ARI(self):
 38 |         score = 0.0 
 39 |         if self.analyzedVars['word_cnt'] > 0.0:
 40 |             score = 4.71 * (self.analyzedVars['char_cnt'] / self.analyzedVars['word_cnt']) + 0.5 * (self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt']) - 21.43
 41 |         return score
 42 |         
 43 |     def FleschReadingEase(self):
 44 |         score = 0.0 
 45 |         if self.analyzedVars['word_cnt'] > 0.0:
 46 |             score = 206.835 - (1.015 * (self.analyzedVars['avg_words_p_sentence'])) - (84.6 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt']))
 47 |         return round(score, 4)
 48 |         
 49 |     def FleschKincaidGradeLevel(self):
 50 |         score = 0.0 
 51 |         if self.analyzedVars['word_cnt'] > 0.0:
 52 |             score = 0.39 * (self.analyzedVars['avg_words_p_sentence']) + 11.8 * (self.analyzedVars['syllable_cnt']/ self.analyzedVars['word_cnt']) - 15.59
 53 |         return round(score, 4)
 54 |         
 55 |     def GunningFogIndex(self):
 56 |         score = 0.0 
 57 |         if self.analyzedVars['word_cnt'] > 0.0:
 58 |             score = 0.4 * ((self.analyzedVars['avg_words_p_sentence']) + (100 * (self.analyzedVars['complex_word_cnt']/self.analyzedVars['word_cnt'])))
 59 |         return round(score, 4)
 60 | 
 61 |     def SMOGIndex(self):
 62 |         score = 0.0 
 63 |         if self.analyzedVars['word_cnt'] > 0.0:
 64 |             score = (math.sqrt(self.analyzedVars['complex_word_cnt']*(30/self.analyzedVars['sentence_cnt'])) + 3)
 65 |         return score
 66 | 
 67 |     def ColemanLiauIndex(self):
 68 |         score = 0.0 
 69 |         if self.analyzedVars['word_cnt'] > 0.0:
 70 |             score = (5.89*(self.analyzedVars['char_cnt']/self.analyzedVars['word_cnt']))-(30*(self.analyzedVars['sentence_cnt']/self.analyzedVars['word_cnt']))-15.8
 71 |         return round(score, 4)
 72 | 
 73 |     def LIX(self):
 74 |         longwords = 0.0
 75 |         score = 0.0 
 76 |         if self.analyzedVars['word_cnt'] > 0.0:
 77 |             for word in self.analyzedVars['words']:
 78 |                 if len(word) >= 7:
 79 |                     longwords += 1.0
 80 |             score = self.analyzedVars['word_cnt'] / self.analyzedVars['sentence_cnt'] + float(100 * longwords) / self.analyzedVars['word_cnt']
 81 |         return score
 82 | 
 83 |     def RIX(self):
 84 |         longwords = 0.0
 85 |         score = 0.0 
 86 |         if self.analyzedVars['word_cnt'] > 0.0:
 87 |             for word in self.analyzedVars['words']:
 88 |                 if len(word) >= 7:
 89 |                     longwords += 1.0
 90 |             score = longwords / self.analyzedVars['sentence_cnt']
 91 |         return score
 92 |         
 93 | 
 94 | if __name__ == "__main__":
 95 |     text = """We are close to wrapping up our 10 week Rails Course. This week we will cover a handful of topics commonly encountered in Rails projects. We then wrap up with part 2 of our Reddit on Rails exercise!  By now you should be hard at work on your personal projects. The students in the course just presented in front of the class with some live demos and a brief intro to to the problems their app were solving. Maybe set aside some time this week to show someone your progress, block off 5 minutes and describe what goal you are working towards, the current state of the project (is it almost done, just getting started, needs UI, etc.), and then show them a quick demo of the app. Explain what type of feedback you are looking for (conceptual, design, usability, etc.) and see what they have to say.  As we are wrapping up the course you need to be focused on learning as much as you can, but also making sure you have the tools to succeed after the class is over."""
 96 | 
 97 |     rd = Readability(text)
 98 |     print 'Test text:'
 99 |     print '"%s"\n' % text
100 |     print 'ARI: ', rd.ARI()
101 |     print 'FleschReadingEase: ', rd.FleschReadingEase()
102 |     print 'FleschKincaidGradeLevel: ', rd.FleschKincaidGradeLevel()
103 |     print 'GunningFogIndex: ', rd.GunningFogIndex()
104 |     print 'SMOGIndex: ', rd.SMOGIndex()
105 |     print 'ColemanLiauIndex: ', rd.ColemanLiauIndex()
106 |     print 'LIX: ', rd.LIX()
107 |     print 'RIX: ', rd.RIX()
108 | 
109 | 


--------------------------------------------------------------------------------
/syllables_en.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Fallback syllable counter
  3 | 
  4 | This is based on the algorithm in Greg Fast's perl module
  5 | Lingua::EN::Syllable.
  6 | """
  7 | 
  8 | import string, re, os
  9 | 
 10 | specialSyllables_en = """tottered 2
 11 | chummed 1
 12 | peeped 1
 13 | moustaches 2
 14 | shamefully 3
 15 | messieurs 2
 16 | satiated 4
 17 | sailmaker 4
 18 | sheered 1
 19 | disinterred 3
 20 | propitiatory 6
 21 | bepatched 2
 22 | particularized 5
 23 | caressed 2
 24 | trespassed 2
 25 | sepulchre 3
 26 | flapped 1
 27 | hemispheres 3
 28 | pencilled 2
 29 | motioned 2
 30 | poleman 2
 31 | slandered 2
 32 | sombre 2
 33 | etc 4
 34 | sidespring 2
 35 | mimes 1
 36 | effaces 2
 37 | mr 2
 38 | mrs 2
 39 | ms 1
 40 | dr 2
 41 | st 1
 42 | sr 2
 43 | jr 2
 44 | truckle 2
 45 | foamed 1
 46 | fringed 2
 47 | clattered 2
 48 | capered 2
 49 | mangroves 2
 50 | suavely 2
 51 | reclined 2
 52 | brutes 1
 53 | effaced 2
 54 | quivered 2
 55 | h'm 1
 56 | veriest 3
 57 | sententiously 4
 58 | deafened 2
 59 | manoeuvred 3
 60 | unstained 2
 61 | gaped 1
 62 | stammered 2
 63 | shivered 2
 64 | discoloured 3
 65 | gravesend 2
 66 | 60 2
 67 | lb 1
 68 | unexpressed 3
 69 | greyish 2
 70 | unostentatious 5
 71 | """
 72 | 
 73 | fallback_cache = {}
 74 | 
 75 | fallback_subsyl = ["cial", "tia", "cius", "cious", "gui", "ion", "iou",
 76 |                    "sia$", ".ely$"]
 77 | 
 78 | fallback_addsyl = ["ia", "riet", "dien", "iu", "io", "ii",
 79 |                    "[aeiouy]bl$", "mbl$",
 80 |                    "[aeiou]{3}",
 81 |                    "^mc", "ism$",
 82 |                    "(.)(?!\\1)([aeiouy])\\2l$",
 83 |                    "[^l]llien",
 84 |                    "^coad.", "^coag.", "^coal.", "^coax.",
 85 |                    "(.)(?!\\1)[gq]ua(.)(?!\\2)[aeiou]",
 86 |                    "dnt$"]
 87 | 
 88 | 
 89 | # Compile our regular expressions
 90 | for i in range(len(fallback_subsyl)):
 91 |     fallback_subsyl[i] = re.compile(fallback_subsyl[i])
 92 | for i in range(len(fallback_addsyl)):
 93 |     fallback_addsyl[i] = re.compile(fallback_addsyl[i])
 94 | 
 95 | def _normalize_word(word):
 96 |     return word.strip().lower()
 97 | 
 98 | # Read our syllable override file and stash that info in the cache
 99 | for line in specialSyllables_en.splitlines():
100 |     line = line.strip()
101 |     if line:
102 |         toks = line.split()
103 |         assert len(toks) == 2
104 |         fallback_cache[_normalize_word(toks[0])] = int(toks[1])
105 | 
106 | def count(word):
107 |     word = _normalize_word(word)
108 |     if not word:
109 |         return 0
110 | 
111 |     # Check for a cached syllable count
112 |     count = fallback_cache.get(word, -1)
113 |     if count > 0:
114 |         return count
115 | 
116 |     # Remove final silent 'e'
117 |     if word[-1] == "e":
118 |         word = word[:-1]
119 | 
120 |     # Count vowel groups
121 |     count = 0
122 |     prev_was_vowel = 0
123 |     for c in word:
124 |         is_vowel = c in ("a", "e", "i", "o", "u", "y")
125 |         if is_vowel and not prev_was_vowel:
126 |             count += 1
127 |         prev_was_vowel = is_vowel
128 | 
129 |     # Add & subtract syllables
130 |     for r in fallback_addsyl:
131 |         if r.search(word):
132 |             count += 1
133 |     for r in fallback_subsyl:
134 |         if r.search(word):
135 |             count -= 1
136 | 
137 |     # Cache the syllable count
138 |     fallback_cache[word] = count
139 | 
140 |     return count
141 | 
142 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | utility functions for breaking down a given block of text
 3 | into it's component syntactic parts.
 4 | """
 5 | 
 6 | import nltk
 7 | 
 8 | from nltk.tokenize import RegexpTokenizer
 9 | import syllables_en
10 | 
11 | TOKENIZER = RegexpTokenizer('(?u)\W+|\$[\d\.]+|\S+')
12 | SPECIAL_CHARS = ['.', ',', '!', '?']
13 | 
14 | def get_char_count(words):
15 |     characters = 0
16 |     for word in words:
17 |         characters += len(word.decode("utf-8"))
18 |     return characters
19 |     
20 | def get_words(text=''):
21 |     words = []
22 |     words = TOKENIZER.tokenize(text)
23 |     filtered_words = []
24 |     for word in words:
25 |         if word in SPECIAL_CHARS or word == " ":
26 |             pass
27 |         else:
28 |             new_word = word.replace(",","").replace(".","")
29 |             new_word = new_word.replace("!","").replace("?","")
30 |             filtered_words.append(new_word)
31 |     return filtered_words
32 | 
33 | def get_sentences(text=''):
34 |     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
35 |     sentences = tokenizer.tokenize(text)
36 |     return sentences
37 | 
38 | def count_syllables(words):
39 |     syllableCount = 0
40 |     for word in words:
41 |         syllableCount += syllables_en.count(word)
42 |     return syllableCount
43 | 
44 | #This method must be enhanced. At the moment it only
45 | #considers the number of syllables in a word.
46 | #This often results in that too many complex words are detected.
47 | def count_complex_words(text=''):
48 |     words = get_words(text)
49 |     sentences = get_sentences(text)
50 |     complex_words = 0
51 |     found = False
52 |     cur_word = []
53 |     
54 |     for word in words:          
55 |         cur_word.append(word)
56 |         if count_syllables(cur_word)>= 3:
57 |             
58 |             #Checking proper nouns. If a word starts with a capital letter
59 |             #and is NOT at the beginning of a sentence we don't add it
60 |             #as a complex word.
61 |             if not(word[0].isupper()):
62 |                 complex_words += 1
63 |             else:
64 |                 for sentence in sentences:
65 |                     if str(sentence).startswith(word):
66 |                         found = True
67 |                         break
68 |                 if found: 
69 |                     complex_words += 1
70 |                     found = False
71 |                 
72 |         cur_word.remove(word)
73 |     return complex_words
74 | 
75 | 


--------------------------------------------------------------------------------