├── autocomplete ├── tests │ ├── __init__.py │ └── test_autocomplete.py ├── helpers.py ├── __init__.py ├── autocomplete.py └── models.py ├── MANIFEST.in ├── setup.cfg ├── bin └── autocomplete_server.py ├── .gitignore ├── setup.py ├── README.rst └── README.md /autocomplete/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include autocomplete/big.txt 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | -------------------------------------------------------------------------------- /bin/autocomplete_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import autocomplete 4 | 5 | autocomplete.run_server() 6 | -------------------------------------------------------------------------------- /autocomplete/helpers.py: -------------------------------------------------------------------------------- 1 | """This file contains a collection of useful and concise functions gathered 2 | from across the Web""" 3 | 4 | import re 5 | 6 | def norm_rsplit(text,n): return text.lower().rsplit(' ', n)[-n:] 7 | 8 | #http://norvig.com/spell-correct.html 9 | def re_split(text): return re.findall('[a-z]+', text.lower()) 10 | 11 | #http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python 12 | #https://github.com/rrenaud/Gibberish-Detector/blob/master/gib_detect_train.py#L16 13 | def chunks(l, n): 14 | for i in range(0, len(l) - n + 1): 15 | yield l[i:i+n] 16 | -------------------------------------------------------------------------------- /autocomplete/tests/test_autocomplete.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | 5 | 6 | class TestLoadWordsModel(TestCase): 7 | def test_load_models(self): 8 | import autocomplete 9 | 10 | is_loaded = autocomplete.load() 11 | self.assertTrue(is_loaded) 12 | 13 | 14 | def test_WORDS_MODEL_not_loaded(self): 15 | from collections import Counter 16 | 17 | from autocomplete import models 18 | 19 | self.assertFalse(len(models.WORDS_MODEL.keys()) > 0) 20 | 21 | def test_WORD_PAIRS_MODEL_not_loaded(self): 22 | from autocomplete import models 23 | 24 | self.assertFalse(len(models.WORD_TUPLES_MODEL.keys()) > 0) 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Pickle'd python objects 9 | .pkl 10 | 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | def readme(): 4 | with open('README.rst') as f: 5 | return f.read() 6 | 7 | setup(name='autocomplete', 8 | version='0.0.104', 9 | description='tiny \'autocomplete\' tool using a "hidden markov model"', 10 | keywords='autocomplete autosuggest suggest complete spell spellsuggest \ 11 | hidden markov model HMM hmm markov chain iPhone iphone suggest \ 12 | Google suggest search as you type searchsuggest type spell \ 13 | automatic spelling word suggest machine learning ai text \ 14 | conditional probability model probabilistic perspective \ 15 | Rodrigo Palacios rodrigo palacios im-rodrigo im_rodrigo \ 16 | rodricios', 17 | 18 | author='Rodrigo Palacios', 19 | author_email='rodrigopala91@gmail.com', 20 | license='MIT', 21 | packages=['autocomplete'], 22 | install_requires=['bottle'], 23 | url='https://github.com/rodricios/autocomplete', 24 | scripts=['bin/autocomplete_server.py'], 25 | package_data={'autocomplete': ['autocomplete/big.txt']}, 26 | test_suite='nose.collector', 27 | tests_require=['nose'], 28 | include_package_data=True, 29 | zip_safe=False) 30 | -------------------------------------------------------------------------------- /autocomplete/__init__.py: -------------------------------------------------------------------------------- 1 | """autocomplete - or How to "suggest" the completion of an unfinished word 2 | using a simple conditional probability model. 3 | 4 | written by Rodrigo Palacios 5 | rodrigopala91@gmail.com 6 | 7 | find me on GitHub or twitter: 8 | http://github.com/rodricios 9 | http://twitter.com/rodricios 10 | - Copyright 2015 11 | 12 | Notes: 13 | 14 | There are two works that have greatly inspired this and my last Python modules. 15 | 16 | The first work is by Peter Norvig, a Director of Research @ Google (according 17 | to his wiki page): 18 | 19 | How to Write a Spelling Corrector: 20 | http://norvig.com/spell-correct.html 21 | 22 | I also suggest watching his lecture The Unreasonable Effectiveness of Data: 23 | https://www.youtube.com/watch?v=yvDCzhbjYWs 24 | 25 | The second is by Rob Renaud who states (in his project's README) that he also 26 | felt inspired and challenged by Peter Norvig's lecture. 27 | 28 | rrenaud's Gibberish-Detector: 29 | https://github.com/rrenaud/Gibberish-Detector 30 | 31 | Finally, the implied challenge issued by Norvig is to try to come up with a 32 | simple solution to some problem using lots of data. He [probabilistically] 33 | solved the spell-checker problem by using text he found within his computer (not 34 | pulled from the internet). This data is contained within big.txt (6mb). I borrow 35 | this corpus, as did Renaud; you will likely see a lot of similarities between 36 | mine, Renaud's, and Norvig's Python projects. That's the point. 
Please feel 37 | free to send me any questions and comments to my email: rodrigopala91@gmail.com 38 | 39 | Cheers, 40 | Rodrigo 41 | """ 42 | 43 | from bottle import route, run, debug 44 | 45 | from autocomplete import models 46 | 47 | from .autocomplete import predict 48 | 49 | def run_server(port_num=8080): 50 | """little demo server for demo'ing sake""" 51 | models.load_models() 52 | 53 | debug(True) 54 | 55 | @route('//') 56 | def index(first_word, second_word): 57 | return dict(predict(first_word, second_word)) 58 | 59 | run(host='localhost', port=port_num) 60 | 61 | 62 | def load(): 63 | """load the classic Norvig big.txt corpus""" 64 | print("training!") 65 | 66 | models.load_models() 67 | 68 | print("done training!") 69 | 70 | return True 71 | 72 | -------------------------------------------------------------------------------- /autocomplete/autocomplete.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from . import models 4 | 5 | from . import helpers 6 | 7 | #the so called "Hidden" step, thus allowing this module to be 8 | #a "Hidden Markov Model"... Whatever that means... 9 | NEARBY_KEYS = { 10 | 'a': 'qwsz', 11 | 'b': 'vghn', 12 | 'c': 'xdfv', 13 | 'd': 'erfcxs', 14 | 'e': 'rdsw', 15 | 'f': 'rtgvcd', 16 | 'g': 'tyhbvf', 17 | 'h': 'yujnbg', 18 | 'j': 'uikmnh', 19 | 'k': 'iolmj', 20 | 'l': 'opk', 21 | 'm': 'njk', 22 | 'n': 'bhjm', 23 | 'o': 'iklp', 24 | 'p': 'ol', 25 | 'q': 'wa', 26 | 'r': 'edft', 27 | 's': 'wedxza', 28 | 't': 'rfgy', 29 | 'u': 'yhji', 30 | 'v': 'cfgb', 31 | 'w': 'qase', 32 | 'x': 'zsdc', 33 | 'y': 'tghu', 34 | 'z': 'asx' 35 | } 36 | 37 | 38 | def this_word(word, top_n=10): 39 | """given an incomplete word, return top n suggestions based off 40 | frequency of words prefixed by said input word""" 41 | try: 42 | return [(k, v) for k, v in models.WORDS_MODEL.most_common() 43 | if k.startswith(word)][:top_n] 44 | except KeyError: 45 | raise Exception("Please load predictive models. Run:\ 46 | \n\tautocomplete.load()") 47 | 48 | 49 | predict_currword = this_word 50 | 51 | 52 | def this_word_given_last(first_word, second_word, top_n=10): 53 | """given a word, return top n suggestions determined by the frequency of 54 | words prefixed by the input GIVEN the occurence of the last word""" 55 | 56 | #Hidden step 57 | possible_second_words = [second_word[:-1]+char 58 | for char in NEARBY_KEYS[second_word[-1]] 59 | if len(second_word) > 2] 60 | 61 | possible_second_words.append(second_word) 62 | 63 | probable_words = {w:c for w, c in 64 | models.WORD_TUPLES_MODEL[first_word.lower()].items() 65 | for sec_word in possible_second_words 66 | if w.startswith(sec_word)} 67 | 68 | return Counter(probable_words).most_common(top_n) 69 | 70 | 71 | predict_currword_given_lastword = this_word_given_last 72 | 73 | 74 | def predict(first_word, second_word, top_n=10): 75 | """given the last word and the current word to complete, we call 76 | predict_currword or predict_currword_given_lastword to retrive most n 77 | probable suggestions. 78 | """ 79 | 80 | try: 81 | if first_word and second_word: 82 | return predict_currword_given_lastword(first_word, 83 | second_word, 84 | top_n=top_n) 85 | else: 86 | return predict_currword(first_word, top_n) 87 | except KeyError: 88 | raise Exception("Please load predictive models. Run:\ 89 | \n\tautocomplete.load()") 90 | 91 | 92 | def split_predict(text, top_n=10): 93 | """takes in string and will right split accordingly. 
94 | Optionally, you can provide keyword argument "top_n" for 95 | choosing the number of suggestions to return (default is 10)""" 96 | text = helpers.norm_rsplit(text, 2) 97 | return predict(*text, top_n=top_n) 98 | -------------------------------------------------------------------------------- /autocomplete/models.py: -------------------------------------------------------------------------------- 1 | """AUTOCOMPLETE - 2 | This file contains the process where we train our predictive models, Also 3 | helpful are the load_models and save_models functions. 4 | """ 5 | 6 | import os 7 | 8 | import collections 9 | 10 | import pickle 11 | 12 | from . import helpers 13 | 14 | WORDS = [] 15 | 16 | WORD_TUPLES = [] 17 | 18 | WORDS_MODEL = {} 19 | 20 | WORD_TUPLES_MODEL = {} 21 | 22 | #This step is where we transform "raw" data 23 | # into some sort of probabilistic model(s) 24 | def train_models(corpus, model_name="models_compressed.pkl"): 25 | """Takes in a preferably long string (corpus/training data), 26 | split that string into a list, we \"chunkify\" resulting in 27 | a list of 2-elem list. Finally we create a dictionary, 28 | where each key = first elem and each value = Counter([second elems]) 29 | 30 | Will save/pickle model by default ('models_compressed.pkl'). 31 | Set second argument to false if you wish to not save the models. 32 | """ 33 | 34 | # "preperation" step 35 | # word is in WORDS 36 | global WORDS 37 | WORDS = helpers.re_split(corpus) 38 | 39 | # first model -> P(word) 40 | global WORDS_MODEL 41 | WORDS_MODEL = collections.Counter(WORDS) 42 | 43 | # another preperation step 44 | # wordA, wordB are in WORDS 45 | global WORD_TUPLES 46 | WORD_TUPLES = list(helpers.chunks(WORDS, 2)) 47 | 48 | # second model -> P(wordA|wordB) 49 | global WORD_TUPLES_MODEL 50 | WORD_TUPLES_MODEL = {first:collections.Counter() 51 | for first, second in WORD_TUPLES} 52 | 53 | for tup in WORD_TUPLES: 54 | try: 55 | WORD_TUPLES_MODEL[tup[0]].update([tup[1]]) 56 | except: 57 | # hack-y fix for uneven # of elements in WORD_TUPLES 58 | pass 59 | 60 | if model_name: 61 | save_models(os.path.join(os.path.dirname(__file__), model_name)) 62 | 63 | 64 | def train_bigtxt(): 65 | """unnecessary helper function for training against 66 | default corpus data (big.txt)""" 67 | 68 | bigtxtpath = os.path.join(os.path.dirname(__file__), 'big.txt') 69 | with open(bigtxtpath, 'rb') as bigtxtfile: 70 | 71 | train_models(str(bigtxtfile.read())) 72 | 73 | 74 | def save_models(path=None): 75 | """Save models to 'path'. If 'path' not specified, 76 | save to module's folder under name 'models_compressed.pkl'""" 77 | 78 | if path == None: 79 | path = os.path.join(os.path.dirname(__file__), 'models_compressed.pkl') 80 | 81 | print("saving to:", path) 82 | #save for next use. pickle format: (key=model name, value=model) 83 | pickle.dump({'words_model': WORDS_MODEL, 84 | 'word_tuples_model': WORD_TUPLES_MODEL}, 85 | open(path, 'wb'), 86 | protocol=2) 87 | 88 | 89 | def load_models(load_path=None): 90 | """Load autocomplete's built-in model (uses Norvig's big.txt). 
Optionally 91 | provide the path to Python pickle object.""" 92 | 93 | if load_path is None: 94 | load_path = os.path.join(os.path.dirname(__file__), 95 | 'models_compressed.pkl') 96 | try: 97 | models = pickle.load(open(load_path,'rb')) 98 | 99 | global WORDS_MODEL 100 | WORDS_MODEL = models['words_model'] 101 | 102 | global WORD_TUPLES_MODEL 103 | WORD_TUPLES_MODEL = models['word_tuples_model'] 104 | 105 | print("successfully loaded: models_compressed.pkl") 106 | except IOError: 107 | print("Error in opening pickle object. Training on default corpus text.") 108 | train_bigtxt() 109 | except KeyError: 110 | print("Error in loading both predictve models.\ 111 | Training on default corpus text.") 112 | train_bigtxt() 113 | except ValueError: 114 | print("Corrupted pickle string.\ 115 | Training on default corpus text (big.txt)") 116 | train_bigtxt() 117 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | *Autocomplete* or: How I learned to stop spelling and love our AI overlords 2 | =========================================================================== 3 | 4 | A practical guide to implementing "autocomplete"! It follows the 5 | sometimes misunderstood principles of conditional probability 6 | distributions and the generalized Hidden Markov Model (HMM). 7 | 8 | Fun fact: Your iPhone's "autocomplete" was implemented using a HMM! Plus 9 | the extra stuff it chose to `sue Samsung 10 | for `__. 11 | 12 | Skip to: 13 | -------- 14 | 15 | - `How to's <#how-to-install>`__ 16 | - `tl;dr? <#tldr>`__ 17 | - `Motivation <#motivation>`__ 18 | - `ELI5 <#explain-like-im-5>`__ 19 | - `If you're not 5 <#if-youre-not-5>`__ 20 | 21 | -------------- 22 | 23 | How to install: 24 | --------------- 25 | 26 | :: 27 | 28 | pip install autocomplete 29 | 30 | How to use: 31 | ----------- 32 | 33 | .. code:: python 34 | 35 | import autocomplete 36 | 37 | # load pickled python Counter objects representing our predictive models 38 | # I use Peter Norvigs big.txt (http://norvig.com/big.txt) to create the predictive models 39 | autocomplete.load() 40 | 41 | # imagine writing "the b" 42 | autocomplete.predict('the','b') 43 | 44 | [('blood', 204), 45 | ('battle', 185), 46 | ('bone', 175), 47 | ('best', 149), 48 | ('body', 149), 49 | ...] 50 | 51 | # now you type an "o" 52 | 53 | autocomplete.predict('the','bo') 54 | 55 | [('bone', 175), 56 | ('body', 149), 57 | ('bones', 122), 58 | ('boy', 46), 59 | ('bottom', 32), 60 | ('box', 24), 61 | ...] 62 | 63 | If you have your own language model in the form described in 64 | `ELI5 <#explain-like-im-5>`__, then use the *models* submodule to call 65 | the training method: 66 | 67 | .. code:: python 68 | 69 | from autocomplete import models 70 | 71 | models.train_models('some giant string of text') 72 | 73 | Want to run it as a server (bottlepy required)? 74 | 75 | .. code:: python 76 | 77 | import autocomplete 78 | 79 | autocomplete.run_server() 80 | 81 | #output 82 | Bottle v0.12.8 server starting up (using WSGIRefServer())... 83 | Listening on http://localhost:8080/ 84 | Hit Ctrl-C to quit. 
85 | 86 | Now head over to http://localhost:8080/the/bo 87 | 88 | :: 89 | 90 | http://localhost:8080/the/bo 91 | #output 92 | {"body": 149, "box": 24, "bottom": 32, "boy": 46, "borzois": 16, "bodies": 13, "bottle": 13, "bones": 122, "book": 14, "bone": 175} 93 | 94 | http://localhost:8080/the/bos 95 | #output 96 | {"boscombe": 11, "boston": 7, "boss": 1, "bosom": 5, "bosses": 4} 97 | 98 | Obligatory tests 99 | ~~~~~~~~~~~~~~~~ 100 | 101 | :: 102 | 103 | python setup.py test 104 | 105 | -------------- 106 | 107 | `tl;dr `__ 108 | ---------------------------------------------------------------------------------------- 109 | 110 | The following code excerpt is my interpretation of a series of 111 | lessons/concepts expressed in a number of different books. 112 | 113 | The unifying concept can be said to be `conditional 114 | probability `__: 115 | 116 | :: 117 | 118 | P(A , B) = P(B | A) * P(A) 119 | 120 | Which can read as saying: 121 | 122 | :: 123 | 124 | The probability of A and B occuring is equal to the probability of B occuring, given that A has occured 125 | 126 | More on this below. 127 | 128 | .. code:: python 129 | 130 | 131 | # "preperation" step 132 | # for every word in corpus, normalize ('The' -> 'the'), insert to list 133 | WORDS = helpers.re_split(corpus) 134 | 135 | # first model -> P(word) 136 | # Counter constructor will take a list of elements and create a frequency distribution (histogram) 137 | WORDS_MODEL = collections.Counter(WORDS) 138 | 139 | # another preperation step 140 | # [a,b,c,d] -> [[a,b], [c,d]] 141 | WORD_TUPLES = list(helpers.chunks(WORDS, 2)) 142 | 143 | # second model -> P(next word | prev. word) 144 | # I interpret "..| prev. word)" as saying "dictionary key 145 | # leading to seperate and smaller (than WORDS_MODEL) freq. dist. 146 | WORD_TUPLES_MODEL = {first:collections.Counter() for first, second in WORD_TUPLES} 147 | 148 | for prev_word, next_word in WORD_TUPLES: 149 | # this is called the "conditioning" step where we assert 150 | # that the probability space of all possible "next_word"'s 151 | # is "conditioned" under the event that "prev_word" has occurred 152 | WORD_TUPLES_MODEL[prev_word].update([next_word]) 153 | 154 | Textbooks, and locations therein, where the concept-in-practice has been 155 | expressed: 156 | 157 | I. `Intro to Statistical Natural Language 158 | Processing `__ 159 | - Manning, Schütze, 1999 160 | 161 | :: 162 | 163 | a. frequency distribution showing the most common words and frequencies in *Tom Sawyer*, pg. 21 164 | 165 | b. conditional probability definition expressed in page 42 - section 2.1.2 166 | 167 | c. the intuition for *frequency* distributions found in pg. 153 (provided in the context of finding [*Collocations*](http://en.wikipedia.org/wiki/Collocation)) 168 | 169 | II. `Probabilistic Graphical 170 | Models `__ 171 | - Kohler, Friedman, 2009 172 | 173 | a. conditional probability definition found on pg. 18 (hilariously 174 | and coincidentally found in section 2.1.2.1) 175 | 176 | III. `Artificial Intelligence - A Modern 177 | Approach `__ - Russell, Norvig, 3rd. 178 | ed. 2010 179 | 180 | a. conditional probability concept explained in pg. 485 181 | 182 | b. the "language" (I take to mean "intuition" for asserting things 183 | in the probabilistic sense) pg. 486 184 | 185 | c. the notion of "conditioning" found in pg. 
492-494 186 | 187 | Motivation 188 | ---------- 189 | 190 | Similar to the motivation behind 191 | `eatiht `__, I found 192 | that it took far too long to find a palpable theory-to-application 193 | example of what amounts to more than a 500 pages of words across 3 194 | books, each spanning a large index of, in certain cases, 195 | *counter-intuitive* nomenclature; read the `light 196 | criticisms `__ 197 | made by Michael I. Jordan on the matter (he was recently named `#2 198 | machine learning expert "we need to know" on 199 | dataconomy.com `__). 200 | 201 | You can find similar thoughts being expressed `**in an article from 2008 202 | (updated 203 | 2009)** `__ 204 | by `Brennan O'Connor `__ 205 | 206 | -------------- 207 | 208 | `*This work is dedicated to my siblings* <#note-1>`__. 209 | 210 | Explain like I'm 5\ `\* <#note-1>`__ 211 | ------------------------------------ 212 | 213 | \*Warning! This explanation is literally intended for young kids - I'm 214 | actually trying to see if these concepts can be explained to an audience 215 | unaware of the nomenclature used within the statistical 216 | `nlp `__ and 217 | other machine learning fields. For example, my 7, 9, 11, 14 y.o. 218 | siblings, and basically anyone else who's ever read a story to a child - 219 | they would be a part of the target audience. 220 | 221 | If you've found this readable and informative, please consider putting 222 | on the goofiest face and reading this to your kids, if you have any :) 223 | If you do, please send me your thoughts on the experience. 224 | 225 | I'm only interested in lowering the barrier to entry. I should have 226 | included this note since the beginning (sorry to those who undoubtedly 227 | left with a bad taste in their mouths). 228 | 229 | You can contact me at rodrigopala91@gmail.com 230 | 231 | Thanks for reading, 232 | 233 | Rodrigo 234 | 235 | ELI5 236 | ---- 237 | 238 | No. I'm explaining this like you're 5. I know you're not *5* , *you 239 | guys... Chris, stop jumping on your sister's back*! 240 | 241 | Ok, so I'm saying, *imagine I'm 5!* 242 | 243 | Oh, that was easy now huh? Let's just forget the *I'm 5* part. 244 | 245 | Imagine a giant collection of books. 246 | 247 | For example, all the Harry Potter and Hunger Games novels put together. 248 | 249 | What if I asked you to go through all the pages and all the words in 250 | those pages? 251 | 252 | Now I'm not asking you *four* to actually *read* the books. You know, 253 | just go through, beginning to end, and notice each word. 254 | 255 | For every new word you see, write it down, and put a "1" next to it, and 256 | everytime you see a word *again*, add "1" more to the previous number. 257 | 258 | So basically I'm asking y'all to keep count of how many times a word 259 | comes up. 260 | 261 | Got it? If yes, cool! If not, find a sibling, friend, or adult near you 262 | and ask them to help you out :) 263 | 264 | ... 265 | 266 | Say you start with *Harry Potter and the Sorcerer's Stone*: 267 | 268 | :: 269 | 270 | Mr. and Mrs. Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much... 271 | 272 | And imagine that you're on the 5th word. This or something close to this 273 | is what you're going for: 274 | 275 | :: 276 | 277 | Mr. -> 1 278 | and -> 1 279 | Mrs. -> 1 280 | Dursley -> 1 281 | of -> 1 282 | 283 | Or if you're a *wannabe-Harry-Potter* fan, ah I'm just kidding! 
284 | 285 | If you started with *the-book-that-must-not-be-named* - I know you guys 286 | won't get it, but persons my age will :) 287 | 288 | Alright! So you started with *The Hunger Games*: 289 | 290 | :: 291 | 292 | When I wake up, the other side of the bed is cold... 293 | 294 | By the sixth word you have: 295 | 296 | :: 297 | 298 | When -> 1 299 | I -> 1 300 | wake -> 1 301 | up -> 1 302 | the -> 1 303 | 304 | You have a long day ahead of you... 305 | 306 | ... 307 | 308 | *1,105,285 words later* 309 | 310 | Now that you're done tallying up all those words, why not order all 311 | these words by the *number of times you've seen them*? 312 | 313 | See you next week! 314 | 315 | ... 316 | 317 | Back so soon? You should have gotten something like this: 318 | 319 | :: 320 | 321 | psst*, remember, the format is: 322 | word -> # of times the word appears 323 | 324 | 'the' -> 80030 325 | 'of' -> 40025 326 | 'and' -> 38313 327 | 'to' -> 28766 328 | 'in' -> 22050 329 | 'a' -> 21155 330 | 'that'-> 12512 331 | 'he' -> 12401 332 | 'was' -> 11410 333 | 'it' -> 10681 334 | ... there's a lot more words you've tallied up... 335 | 336 | Those were the most common words. 337 | 338 | Now on the *less-frequent* end, you'll find your words appearing not as 339 | often... 340 | 341 | :: 342 | 343 | ... 29137 words later. 344 | 'przazdziecka' -> 1 345 | 'disclosure' -> 1 346 | 'galvanism' -> 1 347 | 'repertoire' -> 1 348 | 'bravado' -> 1 349 | 'gal' -> 1 350 | 'ideological' -> 1 351 | 'guaiacol' -> 1 352 | 'expands' -> 1 353 | 'revolvers' -> 1 354 | 355 | Yeah Chris? Oh, 'what does *lez freekend*' mean? Um, so it means 356 | something like: *you probably won't hear or read that word very often.* 357 | 358 | Now what if I asked you to help me find this word I'm looking for? And I 359 | know this word starts with the letters: 'th'. 360 | 361 | I'm pretty sure you guys can do this much faster! 362 | 363 | ... 364 | 365 | *5 minutes later!* 366 | 367 | ... 368 | 369 | Not bad! You only had to go through 29157 unique words after all! 370 | 371 | :: 372 | 373 | 'the' -> 80030 374 | 'that' -> 12512 375 | 'this' -> 4063 376 | 'they' -> 3938 377 | 'there'-> 2972 378 | 'their'-> 2955 379 | 'them' -> 2241 380 | 'then' -> 1558 381 | 'these'-> 1231 382 | 'than' -> 1206 383 | ... 229 words more... 384 | 385 | 239 words, still kind of lot though huh? And you know your big brother, 386 | he's too lazy to do this work *by hand* (*cough* program it up *cough*) 387 | ;) 388 | 389 | So the word I'm looking for is on the tip of my tongue. I think the next 390 | letter is "i". 391 | 392 | *1 minute later* 393 | 394 | :: 395 | 396 | 'this' -> 4063 397 | 'think' -> 557 398 | 'things' -> 321 399 | 'thing' -> 303 400 | 'third' -> 239 401 | 'thin' -> 166 402 | 'thinking' -> 137 403 | 'thirty' -> 123 404 | 'thick' -> 77 405 | 'thirds' -> 43 406 | ... 36 words more... 407 | 408 | *I scan through the first 10 words.* Oh, I just remembered that the next 409 | letter is 'r'. 410 | 411 | *You start taking out even more words.* 412 | 413 | *10 seconds later.* 414 | 415 | :: 416 | 417 | 'third' -> 239 418 | 'thirty' -> 123 419 | 'thirds' -> 43 420 | 'thirteen' -> 32 421 | 'thirst' -> 13 422 | 'thirteenth' -> 11 423 | 'thirdly' -> 8 424 | 'thirsty' -> 5 425 | 'thirtieth' -> 3 426 | 'thirties' -> 2 427 | 428 | Aha, 'thirdly' was the word I was looking for! What, you never heard of 429 | the word "thirdly" before? 430 | 431 | Now you might be saying to yourself, "*that's pretty cool!*\ ", and 432 | you're right! 
433 | 434 | And you know what's cooler? *Making everyone's life a tiny bit easier* 435 | is! :) 436 | 437 | But how can you do that with just *words*? 438 | 439 | Aren't words boring and dull? 440 | 441 | It's like all we do is talk, write, and think with *words*. I mean, how 442 | lame, I can't even describe to you this *autocomplete* 443 | thing-slash-idea-thing without having to write it out with *words*! 444 | 445 | Ugh! I hate words! 446 | 447 | *Whoah, wait a minute! That was not cool of me! Let's relax for a 448 | minute.* 449 | 450 | Let's try to give an imaginary hug to the word-factory in our brains. 451 | That part of our brain works so hard, even when we don't ask it to. How 452 | nice of our brain to do that. Not! 453 | 454 | What I'm trying to is sometimes it's not so nice for our brains to 455 | distract us, especially when we have homework or other, real-world, 456 | problems like adult-homework. 457 | 458 | So how about this: let's try to think about *what* the next sentence 459 | coming out of our own mouths *will be*\ `\* <#note-2>`__. 460 | 461 | Now if you're thinking about what will be coming out of my mouth, or out 462 | of your mouth, or your mouth, or your mouth, or your mouth, you're doing 463 | it wrong! (to readers who aren't one of my 4 younger siblings, that's 464 | how many I have). 465 | 466 | Try your best to think about *what* the next sentence coming out of 467 | *your own* mouth will be. 468 | 469 | ... 470 | 471 | Did you decide on your sentence? Good! 472 | 473 | Now what if I asked you to give me two reasons explaining *why* and 474 | *how* you chose the sentence you chose? 475 | 476 | Wait, I can't even do that! Let's make it easier on ourselves and 477 | explain *why* and *how* we chose the first *word*. 478 | 479 | Still pretty hard huh? 480 | 481 | If you thought about it, and you thought it was pretty darn hard to give 482 | a *good and honest* reason as to why it is you chose the word you chose, 483 | let's bring out a word you guys might not understand: *probability*. 484 | 485 | If you feel like you don't *get* what the word means, sure you do! Just 486 | use the word "probably" in one of your sentences, but but try to makes 487 | some sense. 488 | 489 | What do I mean? Well, let's just consider the English language. Like 490 | most other things, the English language has rules. 491 | 492 | The kind of rules that can be simplified down to: 493 | 494 | 1) "***something*** *action* ***something***". 495 | 496 | 2) Replace ***something***'s and ***action*** with words that make sense 497 | to you. 498 | 499 | Fair enough, right? 500 | 501 | Now, imagine you could put "pause" right after the first word that comes 502 | out of your mouth. 503 | 504 | Let's just say that first word is "the". 505 | 506 | Now in the case that you stuttered for reasons outside your 507 | conscientious control (for example: "thhh thhe the"). No big deal, you 508 | meant to say "the", so let's *flatten* it to just that! 509 | 510 | With that *word* said, what words do you *think* you might have said 511 | after it? 512 | 513 | You might tell me, "*any word I want!* 514 | 515 | Of course you could have! I bet you spent a millisecond thinking about 516 | whether or not the next word you were going to say was going to be: 517 | *guaiacol*. 518 | 519 | I *know* because I thought about using that word too! 520 | 521 | I can remember the first time I heard (or read) *guaiacol* like it was 522 | yesterday. I read it in some funky article on the internet. 
I found the 523 | word in a list of words that don't appear too often in the English 524 | language. 525 | 526 | After I read it, I was able to fit *guaiacol* nicely into that part of 527 | my brain where I... uhh.. was... able... uhh... 528 | 529 | Oh, you *know*, that place in my brain where I get to choose whether I 530 | want to say *the apple*, *the automobile*, *the austronaut*, etc. 531 | 532 | ... 533 | 534 | Ok, so clearly I'm no brainician, and that may or may not be the way our 535 | brain works - actually, it's probably super super unlikely. 536 | 537 | But even though that idea is probably wrong, the idea itself sounds like 538 | a pretty darn good way of suggesting the next word or words somebody is 539 | trying to *type*. 540 | 541 | What if you had a way to count the number of times you've heard "apple" 542 | said after the word "the"? 543 | 544 | Ask yourself the same question, but now with the word "automobile" 545 | instead of "apple". 546 | 547 | What if you had the time to think about every possible word that you've 548 | ever heard spoken after the word "the"? I'd say it might have looked 549 | something like this: 550 | 551 | :: 552 | 553 | Words you might have heard following the word "the" and the number of times you might have heard it 554 | 555 | 'same' -> 996 556 | 'french' -> 688 557 | 'first' -> 652 558 | 'old' -> 591 559 | 'emperor' -> 581 560 | 'other' -> 528 561 | 'whole' -> 500 562 | 'united' -> 466 563 | 'room' -> 376 564 | 'most' -> 373 565 | 566 | ... 9331 more words... 567 | 568 | Not impressed with your brain yet? Let's continue this little thought 569 | experiment further. 570 | 571 | Imagine that you just said "the", and you could put pause after the 572 | first *letter* of the next word out of your mouth: "h". 573 | 574 | Real quick, think of the shortest amount of time you can think of. Think 575 | of the shortest *second* you can think of. Now shorter than that too. 576 | 577 | At this point, you can't even call that length of time a *second*. But 578 | in that length of time, your brain may have just done this: 579 | 580 | :: 581 | 582 | Every word you've ever heard coming after the word "the": 583 | 584 | 'house' -> 284 585 | 'head' -> 117 586 | 'hands' -> 101 587 | 'hand' -> 97 588 | 'horses' -> 71 589 | 'hill' -> 64 590 | 'highest' -> 64 591 | 'high' -> 57 592 | 'history' -> 56 593 | 'heart' -> 55 594 | 595 | And that brain you got did this realllllyyyyyy fast. Faster than Google, 596 | Bing, Yahoo and any other company can ever hope to beat. And your brain 597 | did this without even asking for your permission. I think our brains are 598 | trying to control us you guys, oh no! 599 | 600 | If you're not 5 601 | --------------- 602 | 603 | The basic idea is this: 604 | 605 | Assume you have a large collection of Enlish-understandable text merged 606 | into a single string. 607 | 608 | Start by transforming that string into a list of words (AKA *ngrams of 609 | word-legth*), and also (but not required) normalize each word ('The' -> 610 | 'the'). 611 | 612 | Once you have a normalized list of words, you can start building a 613 | frequency distribution measuring the frequency of each word. 614 | 615 | ... 616 | 617 | At this point you can start "predict" the "final state" of a 618 | word-in-progress. 
But consider the case where a user types in some query 619 | box: 620 | 621 | :: 622 | 623 | "The th" 624 | 625 | And he intends to write: 626 | 627 | :: 628 | 629 | "The third" 630 | 631 | With the above predictive model, you'll be suggesting something like: 632 | 633 | :: 634 | 635 | [ 636 | ('the', 80030), 637 | ('they', 3938), 638 | ('there', 2972), 639 | ... 640 | ] 641 | 642 | This explains one specific type of predictive model, which can be 643 | written as P(word), and you've just seen the pitfalls of using **just** 644 | this model. 645 | 646 | Now for the next word, ask yourself, what's the probability that I'm 647 | going to type the word "apple" given that I wrote "tasty"? 648 | 649 | In machine learning and AI books, you'll be presented *Conditional 650 | Probability* with the following equation: 651 | 652 | :: 653 | 654 | P(word A and word B) = P(word B | word A) * P(word A) 655 | 656 | That equation addresses the problem that I mentioned. 657 | 658 | We've handled P(wordA) already. 659 | 660 | To handle P(word B \| word A), which reads *probability of word A given 661 | word B *, I take a *literall* interpretation of the word "given", in 662 | that context, to mean the following: 663 | 664 | *"word A" is the key pointing to a probability distribution representing 665 | all the words that follow "word A"* 666 | 667 | Once we can represent this second model, we can also apply the 668 | *filtering* step - given that we know more letters in the second word, 669 | we can zone in on more precise suggestions. 670 | 671 | -------------- 672 | 673 | Afterword 674 | ~~~~~~~~~ 675 | 676 | notes: \*I have to give a shout out to `Sam 677 | Harris `__ for being, AFAIK, the first 678 | person or one of the firsts, in `wonderfully putting into 679 | words `__ what I've 680 | borrowed and slightly adapted for this writing. `I highly recommend his 681 | work `__ 682 | 683 | Another shoutout to `Peter Norvig `__ for inspiring 684 | me and probably many others with our own little "toy" programs. His 685 | *Occam's Razor* approach to problem solving will likely cause some 686 | confusion as it may appear that my work is an almost full on copy-paste 687 | of his `*How to Write a Spell 688 | Checker* `__! 689 | 690 | But I swear it's not! I actually I think I may have out-Norvig'ed Peter 691 | Norvig when it comes to describing `conditional 692 | probability `__: 693 | P(wordA & wordB) = P(wordB \| wordA)\*P(wordA) 694 | 695 | And another one to Rob Renaud's `Gibberish 696 | Detector `__. I, out of 697 | pure chance, ran into his project some time after running into Norvig's 698 | article. I can't describe *how much it helped* to intuitively understand 699 | what the heavy hitters of "AI" consider to be introductory material; 700 | this was greatly needed b/c at the time, I felt overwhelmed by my own 701 | desire to really understand this area, and everything else going on. 702 | 703 | I do have a second article about this exact thing, only expressed 704 | differently (audience is non-programming), and it may or may not be 705 | posted soon! [STRIKEOUT:Oh and the code too, that is if someone hasn't 706 | gotten to translating the above article to code before I can get to 707 | uploading the project :P I'm trying to get the kinks out of here and the 708 | code so it's simple, duh!] 
709 | 710 | I dedicate this work to my sisters, Cat, Melissa and Christine, and my 711 | favorite brother, Christian :) 712 | 713 | note 1 714 | ^^^^^^ 715 | 716 | `go back <#explain-like-im-5>`__ 717 | 718 | *To avoid confusion, I wrote this section in the form of a letter to my 719 | younger siblings* 720 | 721 | note 2 722 | ^^^^^^ 723 | 724 | \*I'm borrowing, what I consider, `one of the most beautiful thought 725 | experiments I've ever heard trying to describe one's 726 | self `__. I'm a big 727 | fan of Sam Harris's work. Highly recommend! 728 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *Autocomplete* or: How I learned to stop spelling and love our AI overlords 2 | === 3 | 4 | Autocomplete is an adult and kid friendly exercise in creating your own AI program. 5 | 6 | For those short on time, the [ELI5](#explain-like-im-5) section is devoid of nomenclature but lengthy; the [tl;dr](#tldr) section describes the implementation using the appropriate terms - basic principles of conditional probability, generalized [Markov chain](http://en.wikipedia.org/wiki/Markov_chain) - but is short, concise, and includes references for further reading. 7 | 8 | ## Skip to: 9 | 10 | * [How to's](#how-to-install) 11 | * [New: Spell Correction](#spell-correction) 12 | * [tl;dr?](#tldr) 13 | * [Motivation](#motivation) 14 | * [ELI5](#explain-like-im-5) 15 | * [If you're not 5](#if-youre-not-5) 16 | * [Afterword](https://github.com/rodricios/autocomplete#afterword) 17 | 18 | 19 | --- 20 | 21 | ## How to install: 22 | 23 | pip install autocomplete 24 | 25 | ## How to use: 26 | 27 | ```python 28 | import autocomplete 29 | 30 | # load pickled python Counter objects representing our predictive models 31 | # I use Peter Norvigs big.txt (http://norvig.com/big.txt) to create the predictive models 32 | autocomplete.load() 33 | 34 | # imagine writing "the b" 35 | autocomplete.predict('the','b') 36 | 37 | [('blood', 204), 38 | ('battle', 185), 39 | ('bone', 175), 40 | ('best', 149), 41 | ('body', 149), 42 | ...] 43 | 44 | # now you type an "o" 45 | 46 | autocomplete.predict('the','bo') 47 | 48 | [('bone', 175), 49 | ('body', 149), 50 | ('bones', 122), 51 | ('boy', 46), 52 | ('bottom', 32), 53 | ('box', 24), 54 | ...] 55 | ``` 56 | 57 | ### Spell Correction 58 | 59 | Now say you are in the process of typing "body" (with a preceding "the") 60 | 61 | ```python 62 | 63 | autocomplete.predict('the','bo') 64 | 65 | [('bone', 175), 66 | ('body', 149), 67 | ('bones', 122), 68 | ('boy', 46), 69 | ('bottom', 32), 70 | ('box', 24), 71 | ...] 72 | 73 | ``` 74 | 75 | And then you make the fatal error of typing an "f" instead of a "d" 76 | 77 | ```python 78 | 79 | autocomplete.predict('the','bof') 80 | 81 | [('body', 149), 82 | ('bottom', 32), 83 | ('borzois', 16), 84 | ('bottle', 13), 85 | ('bodies', 13), 86 | ('border', 12) 87 | ...] 88 | 89 | ``` 90 | 91 | Relax! Autocomplete has you covered. Using a simple ["fat-finger"](http://en.wikipedia.org/wiki/Fat-finger_error) error model, 92 | you can rest assured that you won't be making [six-hundred billion dollar mistakes](http://www.bbc.com/news/business-29454265) at your Japanese investment firm. 
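Under the hood, the error model simply swaps the last typed letter with its physical keyboard neighbors and then filters the conditional model with every resulting prefix. Below is a simplified, self-contained sketch of that step (the full `NEARBY_KEYS` table ships in `autocomplete/autocomplete.py`; the table and counts used here are abridged toy values for illustration only):

```python
# Simplified sketch of the "fat-finger" step used by
# autocomplete.predict_currword_given_lastword (see autocomplete/autocomplete.py).
# NEARBY_KEYS is abridged here; the package ships the full QWERTY table.
from collections import Counter

NEARBY_KEYS = {'d': 'erfcxs', 'f': 'rtgvcd'}

def candidate_prefixes(typed):
    """'bof' -> ['bor', 'bot', ..., 'bod', 'bof']: the last letter swapped with its neighbors."""
    neighbors = NEARBY_KEYS.get(typed[-1], '')
    return [typed[:-1] + ch for ch in neighbors] + [typed]

def suggest(word_counts, typed, top_n=5):
    """Rank words that start with any fat-finger candidate of `typed`."""
    prefixes = candidate_prefixes(typed)
    matches = {w: c for w, c in word_counts.items()
               if any(w.startswith(p) for p in prefixes)}
    return Counter(matches).most_common(top_n)

# toy stand-in for the counts stored under WORD_TUPLES_MODEL['the']
the_counts = Counter({'body': 149, 'bone': 175, 'bottom': 32, 'ford': 3})
print(suggest(the_counts, 'bof'))
# [('body', 149), ('bottom', 32)] -- 'd' and 't' are keyboard neighbors of the mistyped 'f'
```

In the package itself, the counts come from `WORD_TUPLES_MODEL[prev_word]` (built by `models.train_models`) rather than a hand-written `Counter`.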
93 | 94 | 95 | If you have your own language model in the form described in [ELI5](#explain-like-im-5), then use the *models* submodule to call the training method: 96 | 97 | ```python 98 | from autocomplete import models 99 | 100 | models.train_models('some giant string of text') 101 | 102 | ``` 103 | 104 | Want to run it as a server (bottlepy required)? 105 | 106 | ```python 107 | import autocomplete 108 | 109 | autocomplete.run_server() 110 | 111 | #output 112 | Bottle v0.12.8 server starting up (using WSGIRefServer())... 113 | Listening on http://localhost:8080/ 114 | Hit Ctrl-C to quit. 115 | 116 | ``` 117 | 118 | Now head over to http://localhost:8080/the/bo 119 | 120 | ``` 121 | http://localhost:8080/the/bo 122 | #output 123 | {"body": 149, "box": 24, "bottom": 32, "boy": 46, "borzois": 16, "bodies": 13, "bottle": 13, "bones": 122, "book": 14, "bone": 175} 124 | 125 | http://localhost:8080/the/bos 126 | #output 127 | {"boscombe": 11, "boston": 7, "boss": 1, "bosom": 5, "bosses": 4} 128 | ``` 129 | 130 | ### Obligatory tests 131 | ``` 132 | python setup.py test 133 | ``` 134 | 135 | 136 | --- 137 | 138 | ## [tl;dr](https://github.com/rodricios/autocomplete/blob/master/autocomplete/models.py) 139 | 140 | The following code excerpt is my interpretation of a series of lessons/concepts expressed in a number of different books. 141 | 142 | The unifying concept can be said to be [conditional probability](http://en.wikipedia.org/wiki/Conditional_probability): 143 | 144 | P(A , B) = P(B | A) * P(A) 145 | 146 | Which can read as saying: 147 | 148 | The probability of A and B occuring is equal to the probability of B occuring, given that A has occured 149 | 150 | More on this below. 151 | 152 | ```python 153 | 154 | # "preperation" step 155 | # for every word in corpus, normalize ('The' -> 'the'), insert to list 156 | WORDS = helpers.re_split(corpus) 157 | 158 | # first model -> P(word) 159 | # Counter constructor will take a list of elements and create a frequency distribution (histogram) 160 | WORDS_MODEL = collections.Counter(WORDS) 161 | 162 | # another preperation step 163 | # [a,b,c,d] -> [[a,b], [b,c], [c,d]] 164 | WORD_TUPLES = list(helpers.chunks(WORDS, 2)) 165 | 166 | # second model -> P(next word | prev. word) 167 | # I interpret "..| prev. word)" as saying "dictionary key 168 | # leading to seperate and smaller (than WORDS_MODEL) freq. dist. 169 | WORD_TUPLES_MODEL = {first:collections.Counter() for first, second in WORD_TUPLES} 170 | 171 | for prev_word, next_word in WORD_TUPLES: 172 | # this is called the "conditioning" step where we assert 173 | # that the probability space of all possible "next_word"'s 174 | # is "conditioned" under the event that "prev_word" has occurred 175 | WORD_TUPLES_MODEL[prev_word].update([next_word]) 176 | 177 | ``` 178 | 179 | Textbooks, and locations therein, where the concept-in-practice has been expressed: 180 | 181 | I. [Intro to Statistical Natural Language Processing](http://ics.upjs.sk/~pero/web/documents/pillar/Manning_Schuetze_StatisticalNLP.pdf) - Manning, Schütze, 1999 182 | 183 | a. frequency distribution showing the most common words and frequencies in *Tom Sawyer*, pg. 21 184 | 185 | b. conditional probability definition expressed in page 42 - section 2.1.2 186 | 187 | c. the intuition for *frequency* distributions found in pg. 153 (provided in the context of finding [*Collocations*](http://en.wikipedia.org/wiki/Collocation)) 188 | 189 | II. 
[Probabilistic Graphical Models](http://mitpress.mit.edu/books/probabilistic-graphical-models) - Kohler, Friedman, 2009 190 | 191 | a. conditional probability definition found on pg. 18 (hilariously and coincidentally found in section 2.1.2.1) 192 | 193 | III. [Artificial Intelligence - A Modern Approach](http://aima.cs.berkeley.edu) - Russell, Norvig, 3rd. ed. 2010 194 | 195 | a. conditional probability concept explained in pg. 485 196 | 197 | b. the "language" (I take to mean "intuition" for asserting things in the probabilistic sense) pg. 486 198 | 199 | c. the notion of "conditioning" found in pg. 492-494 200 | 201 | ## Motivation 202 | 203 | Similar to the motivation behind [eatiht](https://github.com/rodricios/eatiht#motivation), I found that it took far too long to find a palpable theory-to-application example of what amounts to more than a 500 pages of words across 3 books, each spanning a large index of, in certain cases, *counter-intuitive* nomenclature; read the [light criticisms](http://www.reddit.com/r/MachineLearning/comments/2fxi6v/ama_michael_i_jordan/ckep3z6) made by Michael I. Jordan on the matter (he was recently named [#2 machine learning expert "we need to know" on dataconomy.com](http://dataconomy.com/10-machine-learning-experts-you-need-to-know/)). 204 | 205 | You can find similar thoughts being expressed [**in an article from 2008 (updated 2009)**](http://brenocon.com/blog/2008/12/statistics-vs-machine-learning-fight/) by [Brennan O'Connor](http://brenocon.com) 206 | 207 | --- 208 | 209 | [*This work is dedicated to my siblings*](#note-1). 210 | 211 | ## Explain like I'm 5[*](#note-1) 212 | 213 | *Warning! This explanation is literally intended for young kids - I'm actually trying to see if these concepts can be explained to an audience unaware of the nomenclature used within the statistical [nlp](http://en.wikipedia.org/wiki/Natural_language_processing) and other machine learning fields. For example, my 7, 9, 11, 14 y.o. siblings, and basically anyone else who's ever read a story to a child - they would be a part of the target audience. 214 | 215 | If you've found this readable and informative, please consider putting on the goofiest face and reading this to your kids, if you have any :) If you do, please send me your thoughts on the experience. 216 | 217 | I'm only interested in lowering the barrier to entry. I should have included this note since the beginning (sorry to those who undoubtedly left with a bad taste in their mouth). 218 | 219 | You can contact me at rodrigopala91@gmail.com 220 | 221 | Thanks for reading, 222 | 223 | Rodrigo 224 | 225 | ## ELI5 226 | 227 | No. I'm explaining this like you're 5. I know you're not *5* , *you guys... Chris, stop jumping on your sister's back*! 228 | 229 | Ok, so I'm saying, *imagine I'm 5!* 230 | 231 | Oh, that was easy now huh? Let's just forget the *I'm 5* part. 232 | 233 | Imagine a giant collection of books. 234 | 235 | For example, all the Harry Potter and Hunger Games novels put together. 236 | 237 | What if I asked you to go through all the pages and all the words in those pages? 238 | 239 | Now I'm not asking you *four* to actually *read* the books. You know, just go through, beginning to end, and notice each word. 240 | 241 | For every new word you see, write it down, and put a "1" next to it, and everytime you see a word *again*, add "1" more to the previous number. 242 | 243 | So basically I'm asking y'all to keep count of how many times a word comes up. 244 | 245 | Got it? If yes, cool! 
If not, find a sibling, friend, or adult near you and ask them to help you out :) 246 | 247 | ... 248 | 249 | Say you start with *Harry Potter and the Sorcerer's Stone*: 250 | 251 | Mr. and Mrs. Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much... 252 | 253 | And imagine that you're on the 5th word. This or something close to this is what you're going for: 254 | 255 | Mr. -> 1 256 | and -> 1 257 | Mrs. -> 1 258 | Dursley -> 1 259 | of -> 1 260 | 261 | 262 | Or if you're a *wannabe-Harry-Potter* fan, ah I'm just kidding! 263 | 264 | If you started with *the-book-that-must-not-be-named* - I know you guys won't get it, but persons my age will :) 265 | 266 | Alright! So you started with *The Hunger Games*: 267 | 268 | When I wake up, the other side of the bed is cold... 269 | 270 | By the sixth word you have: 271 | 272 | When -> 1 273 | I -> 1 274 | wake -> 1 275 | up -> 1 276 | the -> 1 277 | 278 | You have a long day ahead of you... 279 | 280 | ... 281 | 282 | *1,105,285 words later* 283 | 284 | Now that you're done tallying up all those words, why not order all these words by the *number of times you've seen them*? 285 | 286 | See you next week! 287 | 288 | ... 289 | 290 | Back so soon? You should have gotten something like this: 291 | 292 | psst*, remember, the format is: 293 | word -> # of times the word appears 294 | 295 | 'the' -> 80030 296 | 'of' -> 40025 297 | 'and' -> 38313 298 | 'to' -> 28766 299 | 'in' -> 22050 300 | 'a' -> 21155 301 | 'that'-> 12512 302 | 'he' -> 12401 303 | 'was' -> 11410 304 | 'it' -> 10681 305 | ... there's a lot more words you've tallied up... 306 | 307 | 308 | Those were the most common words. 309 | 310 | Now on the *less-frequent* end, you'll find your words appearing not as often... 311 | 312 | ... 29137 words later. 313 | 'przazdziecka' -> 1 314 | 'disclosure' -> 1 315 | 'galvanism' -> 1 316 | 'repertoire' -> 1 317 | 'bravado' -> 1 318 | 'gal' -> 1 319 | 'ideological' -> 1 320 | 'guaiacol' -> 1 321 | 'expands' -> 1 322 | 'revolvers' -> 1 323 | 324 | Yeah Chris? Oh, 'what does *lez freekend*' mean? Um, so it means something like: *you probably won't hear or read that word very often.* 325 | 326 | Now what if I asked you to help me find this word I'm looking for? And I know this word starts with the letters: 'th'. 327 | 328 | I'm pretty sure you guys can do this much faster! 329 | 330 | ... 331 | 332 | *5 minutes later!* 333 | 334 | ... 335 | 336 | Not bad! You only had to go through 29157 unique words after all! 337 | 338 | 339 | 'the' -> 80030 340 | 'that' -> 12512 341 | 'this' -> 4063 342 | 'they' -> 3938 343 | 'there'-> 2972 344 | 'their'-> 2955 345 | 'them' -> 2241 346 | 'then' -> 1558 347 | 'these'-> 1231 348 | 'than' -> 1206 349 | ... 229 words more... 350 | 351 | 352 | 239 words, still kind of lot though huh? And you know your big brother, he's too lazy to do this work *by hand* (*cough* program it up *cough*) ;) 353 | 354 | So the word I'm looking for is on the tip of my tongue. I think the next letter is "i". 355 | 356 | *1 minute later* 357 | 358 | 'this' -> 4063 359 | 'think' -> 557 360 | 'things' -> 321 361 | 'thing' -> 303 362 | 'third' -> 239 363 | 'thin' -> 166 364 | 'thinking' -> 137 365 | 'thirty' -> 123 366 | 'thick' -> 77 367 | 'thirds' -> 43 368 | ... 36 words more... 369 | 370 | 371 | *I scan through the first 10 words.* Oh, I just remembered that the next letter is 'r'. 
372 | 373 | *You start taking out even more words.* 374 | 375 | *10 seconds later.* 376 | 377 | 'third' -> 239 378 | 'thirty' -> 123 379 | 'thirds' -> 43 380 | 'thirteen' -> 32 381 | 'thirst' -> 13 382 | 'thirteenth' -> 11 383 | 'thirdly' -> 8 384 | 'thirsty' -> 5 385 | 'thirtieth' -> 3 386 | 'thirties' -> 2 387 | 388 | Aha, 'thirdly' was the word I was looking for! What, you never heard of the word "thirdly" before? 389 | 390 | Now you might be saying to yourself, "*that's pretty cool!*", and you're right! 391 | 392 | And you know what's cooler? *Making everyone's life a tiny bit easier* is! :) 393 | 394 | But how can you do that with just *words*? 395 | 396 | Aren't words boring and dull? 397 | 398 | It's like all we do is talk, write, and think with *words*. I mean, how lame, I can't even describe to you this *autocomplete* thing-slash-idea-thing without having to write it out with *words*! 399 | 400 | Ugh! I hate words! 401 | 402 | *Whoah, wait a minute! That was not cool of me! Let's relax for a minute.* 403 | 404 | Let's try to give an imaginary hug to the word-factory in our brains. That part of our brain works so hard, even when we don't ask it to. How nice of our brain to do that. Not! 405 | 406 | What I'm trying to say is that sometimes it's not very nice for our brains to distract us, especially when we have homework or other, real-world problems like adult-homework. 407 | 408 | ... 409 | 410 | So how about this: 411 | 412 | As a mental exercise, let's just try to think about *what* the next sentence coming out of our own mouths *will be*[\*](#note-2). 413 | 414 | Now if you're thinking about what will be coming out of my mouth, or out of your mouth, or your mouth, or your mouth, or your mouth, you're doing it wrong! (to readers who aren't one of my 4 younger siblings, that's how many I have). 415 | 416 | Try your best to think about *what* the next sentence coming out of *your own* mouth will be. 417 | 418 | ... 419 | 420 | Did you decide on your sentence? Good! 421 | 422 | Now what if I asked you to give me two **good** reasons explaining *why* and *how* you chose the sentence you chose? 423 | 424 | Wait, I can't even do that! Let's make it easier on ourselves. Let's try to only answer *why* and *how* we chose just the first word. 425 | 426 | Still pretty hard huh? 427 | 428 | If you thought it was pretty darn hard to give a *good and honest* reason as to why it is you chose the word you chose, it's alright. :) 429 | 430 | But like all couch-scientists, let's just make a guess! My guess is: our brain is a **probabilistic machine**. 431 | 432 | If you feel like you don't *get* what the word "probabilisitic" or "probability" means, sure you do! Just use the word "probably" in one of your sentences, but try to make some sense. 433 | 434 | Ok, so what do I mean? Well, let's just consider the English language. Like most other things, the English language has rules. 435 | 436 | The kind of rules that can be simplified down to: 437 | 438 | 1) "***something*** *action* ***something***". 439 | 440 | 2) Replace ***something***'s and ***action*** with words that make sense to you. 441 | 442 | Fair enough, right? 443 | 444 | Now imagine that your brain essentially has those rules "branded" or "recorded" into itself. Ok, so now I'm starting to not make much sense huh? 445 | 446 | How about this? How many times have you heard, 447 | 448 | "**Do** your **bed**!" 449 | 450 | "**Brush** your **teeth**!" 451 | 452 | "**Let's** get **food**!" 
453 | 454 | While each one of you guys may have not heard those *exact* sentences, what I'm trying to say makes sense right? *That you probably heard certain sentences more often than others?* 455 | 456 | ... 457 | 458 | Now, imagine you could put "pause" right after the first word that comes out of your mouth. 459 | 460 | Let's just say that first word is "the". 461 | 462 | Now in the case that you stuttered for reasons outside your conscientious control (for example: "thhh thhe the"). No big deal, you meant to say "the", so let's *flatten* it to just that! 463 | 464 | With that *word* said, what words do you *think* you might have said after it? 465 | 466 | You might tell me, "*any word I want!* 467 | 468 | Of course you could have! I bet you spent a millisecond thinking about whether or not the next word you were going to say was going to be: *guaiacol*. 469 | 470 | I *know* because I thought about using that word too! 471 | 472 | I can remember the first time I heard (or read) *guaiacol* like it was yesterday. I read it in some funky article on the internet. I found the word in a list of words that don't appear too often in the English language. 473 | 474 | After I read it, I was able to fit *guaiacol* nicely into that part of my brain where I... uhh.. was... able... uhh... 475 | 476 | Oh, you *know*, that place in my brain where I get to choose whether I want to say *the apple*, *the automobile*, *the austronaut*, etc. 477 | 478 | ... 479 | 480 | Ok, so clearly I'm no brainician, and that may or may not be the way our brain works. 481 | 482 | But even though that idea might be wrong, the idea itself sounds like a pretty darn good way of suggesting the next word or words somebody is trying to *type*. 483 | 484 | What if you had a way to count the number of times you've heard "apple" said after the word "the"? 485 | 486 | Ask yourself the same question, but now with the word "automobile" instead of "apple". 487 | 488 | What if you had the time to think about every possible word that you've ever heard spoken after the word "the"? I'd say it might have looked something like this: 489 | 490 | Words you might have heard following the word "the" and the number of times you might have heard it 491 | 492 | 'same' -> 996 493 | 'french' -> 688 494 | 'first' -> 652 495 | 'old' -> 591 496 | 'emperor' -> 581 497 | 'other' -> 528 498 | 'whole' -> 500 499 | 'united' -> 466 500 | 'room' -> 376 501 | 'most' -> 373 502 | 503 | ... 9331 more words... 504 | 505 | Not impressed with your brain yet? Let's continue this little thought experiment further. 506 | 507 | Imagine that you just said "the", and you could put pause after the first *letter* of the next word out of your mouth: "h". 508 | 509 | Real quick, think of the shortest amount of time you can think of. Think of the shortest *second* you can think of. Now shorter than that too. 510 | 511 | At this point, you can't even call that length of time a *second*. But in that length of time, your brain may have just done this: 512 | 513 | Every word you've ever heard coming after the word "the": 514 | 515 | 'house' -> 284 516 | 'head' -> 117 517 | 'hands' -> 101 518 | 'hand' -> 97 519 | 'horses' -> 71 520 | 'hill' -> 64 521 | 'highest' -> 64 522 | 'high' -> 57 523 | 'history' -> 56 524 | 'heart' -> 55 525 | 526 | And that brain you got did this realllllyyyyyy fast. Faster than Google, Bing, Yahoo and any other company can ever hope to beat. And your brain did this without even asking for your permission. 
I think our brains are trying to control us you guys, oh no!
527 | 
528 | ...
529 | 
530 | Thanks for reading this far, folks. Please go to the [afterword](https://github.com/rodricios/autocomplete#afterword) for some of the resources I've found useful in both building the intuition and writing this article.
531 | 
532 | Also, if it's not too much to ask, consider following me or tweeting this to your friends and/or family; any support is appreciated :)
533 | 
534 | - [@rodricios](https://twitter.com/rodricios)
535 | 
536 | ## If you're not 5
537 | 
538 | The basic idea is this:
539 | 
540 | Assume you have a large collection of English-understandable text merged into a single string.
541 | 
542 | Start by transforming that string into a list of words (AKA *word-length ngrams*), and also (though it's not required) normalize each word ('The' -> 'the').
543 | 
544 | Once you have a normalized list of words, you can start building a frequency distribution measuring the frequency of each word.
545 | 
546 | ...
547 | 
548 | At this point you can start to "predict" the "final state" of a word-in-progress. But consider the case where a user types in some query box:
549 | 
550 |     "The th"
551 | 
552 | And he intends to write:
553 | 
554 |     "The third"
555 | 
556 | With the above predictive model, you'll be suggesting something like:
557 | 
558 |     [
559 |     ('the', 80030),
560 |     ('they', 3938),
561 |     ('there', 2972),
562 |     ...
563 |     ]
564 | 
565 | This explains one specific type of predictive model, which can be written as P(word), and you've just seen the pitfalls of using **just** this model.
566 | 
567 | Now for the next word, ask yourself: what's the probability that I'm going to type the word "apple" given that I wrote "tasty"?
568 | 
569 | In machine learning and AI books, you'll be presented with *Conditional Probability* via the following equation:
570 | 
571 |     P(word A and word B) = P(word B | word A) * P(word A)
572 | 
573 | That equation addresses the problem that I mentioned.
574 | 
575 | We've handled P(word A) already.
576 | 
577 | To handle P(word B | word A), which reads *probability of word B given word A*, I take a *literal* interpretation of the word "given", in that context, to mean the following:
578 | 
579 | *"word A" is the key pointing to a probability distribution representing all the words that follow "word A"*
580 | 
581 | Once we can represent this second model, we can also apply the *filtering* step - given that we know more letters in the second word, we can zone in on more precise suggestions.
582 | 
583 | ---
584 | 
585 | ### Afterword
586 | 
587 | notes: \*I have to give a shout out to [Sam Harris](https://twitter.com/SamHarrisOrg) for being, AFAIK, the first person (or one of the first) to [wonderfully put into words](https://www.youtube.com/watch?v=pCofmZlC72g#t=1144) what I've borrowed and slightly adapted for this writing. [I highly recommend his work](http://www.samharris.org/)
588 | 
589 | Another shoutout to [Peter Norvig](http://norvig.com) for inspiring me and probably many others with our own little "toy" programs. His *Occam's Razor* approach to problem solving will likely cause some confusion, as it may appear that my work is an almost full-on copy-paste of his [*How to Write a Spelling Corrector*](http://norvig.com/spell-correct.html)!
590 | 
591 | But I swear it's not! I actually think I may have out-Norvig'ed Peter Norvig when it comes to describing [conditional probability](http://en.wikipedia.org/wiki/Conditional_probability): P(wordA & wordB) = P(wordB | wordA)\*P(wordA)
592 | 
593 | And another one to Rob Renaud's [Gibberish Detector](https://github.com/rrenaud/Gibberish-Detector). I ran into his project, out of pure chance, some time after running into Norvig's article. I can't describe *how much it helped* to intuitively understand what the heavy hitters of "AI" consider to be introductory material; this was greatly needed b/c at the time, I felt overwhelmed by my own desire to really understand this area, and everything else going on.
594 | 
595 | I do have a second article about this exact thing, only expressed differently (the audience is non-programming), and it may or may not be posted soon! ~~Oh and the code too, that is if someone hasn't gotten to translating the above article to code before I can get to uploading the project :P I'm trying to get the kinks out of here and the code so it's simple, duh!~~
596 | 
597 | I dedicate this work to my sisters, Cat, Melissa and Christine, and my favorite brother, Christian :)
598 | 
599 | #### note 1
600 | 
601 | [go back](#explain-like-im-5)
602 | 
603 | *To avoid confusion, I wrote this section in the form of a letter to my younger siblings*
604 | 
605 | #### note 2
606 | 
607 | *I'm borrowing what I consider [one of the most beautiful thought experiments I've ever heard trying to describe one's self](https://www.youtube.com/watch?v=pCofmZlC72g#t=1144). I'm a big fan of Sam Harris's work. Highly recommend!*
608 | 
--------------------------------------------------------------------------------