├── autocomplete ├── tests │ ├── __init__.py │ └── test_autocomplete.py ├── helpers.py ├── __init__.py ├── autocomplete.py └── models.py ├── MANIFEST.in ├── setup.cfg ├── bin └── autocomplete_server.py ├── .gitignore ├── setup.py ├── README.rst └── README.md /autocomplete/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include autocomplete/big.txt 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | -------------------------------------------------------------------------------- /bin/autocomplete_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import autocomplete 4 | 5 | autocomplete.run_server() 6 | -------------------------------------------------------------------------------- /autocomplete/helpers.py: -------------------------------------------------------------------------------- 1 | """This file contains a collection of useful and concise functions gathered 2 | from across the Web""" 3 | 4 | import re 5 | 6 | def norm_rsplit(text,n): return text.lower().rsplit(' ', n)[-n:] 7 | 8 | #http://norvig.com/spell-correct.html 9 | def re_split(text): return re.findall('[a-z]+', text.lower()) 10 | 11 | #http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python 12 | #https://github.com/rrenaud/Gibberish-Detector/blob/master/gib_detect_train.py#L16 13 | def chunks(l, n): 14 | for i in range(0, len(l) - n + 1): 15 | yield l[i:i+n] 16 | -------------------------------------------------------------------------------- /autocomplete/tests/test_autocomplete.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | 5 | 6 | class TestLoadWordsModel(TestCase): 7 | def test_load_models(self): 8 | import autocomplete 9 | 10 | is_loaded = autocomplete.load() 11 | self.assertTrue(is_loaded) 12 | 13 | 14 | def test_WORDS_MODEL_not_loaded(self): 15 | from collections import Counter 16 | 17 | from autocomplete import models 18 | 19 | self.assertFalse(len(models.WORDS_MODEL.keys()) > 0) 20 | 21 | def test_WORD_PAIRS_MODEL_not_loaded(self): 22 | from autocomplete import models 23 | 24 | self.assertFalse(len(models.WORD_TUPLES_MODEL.keys()) > 0) 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Pickle'd python objects 9 | .pkl 10 | 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | def readme(): 4 | with open('README.rst') as f: 5 | return f.read() 6 | 7 | setup(name='autocomplete', 8 | version='0.0.104', 9 | description='tiny \'autocomplete\' tool using a "hidden markov model"', 10 | keywords='autocomplete autosuggest suggest complete spell spellsuggest \ 11 | hidden markov model HMM hmm markov chain iPhone iphone suggest \ 12 | Google suggest search as you type searchsuggest type spell \ 13 | automatic spelling word suggest machine learning ai text \ 14 | conditional probability model probabilistic perspective \ 15 | Rodrigo Palacios rodrigo palacios im-rodrigo im_rodrigo \ 16 | rodricios', 17 | 18 | author='Rodrigo Palacios', 19 | author_email='rodrigopala91@gmail.com', 20 | license='MIT', 21 | packages=['autocomplete'], 22 | install_requires=['bottle'], 23 | url='https://github.com/rodricios/autocomplete', 24 | scripts=['bin/autocomplete_server.py'], 25 | package_data={'autocomplete': ['autocomplete/big.txt']}, 26 | test_suite='nose.collector', 27 | tests_require=['nose'], 28 | include_package_data=True, 29 | zip_safe=False) 30 | -------------------------------------------------------------------------------- /autocomplete/__init__.py: -------------------------------------------------------------------------------- 1 | """autocomplete - or How to "suggest" the completion of an unfinished word 2 | using a simple conditional probability model. 3 | 4 | written by Rodrigo Palacios 5 | rodrigopala91@gmail.com 6 | 7 | find me on GitHub or twitter: 8 | http://github.com/rodricios 9 | http://twitter.com/rodricios 10 | - Copyright 2015 11 | 12 | Notes: 13 | 14 | There are two works that have greatly inspired this and my last Python modules. 15 | 16 | The first work is by Peter Norvig, a Director of Research @ Google (according 17 | to his wiki page): 18 | 19 | How to Write a Spelling Corrector: 20 | http://norvig.com/spell-correct.html 21 | 22 | I also suggest watching his lecture The Unreasonable Effectiveness of Data: 23 | https://www.youtube.com/watch?v=yvDCzhbjYWs 24 | 25 | The second is by Rob Renaud who states (in his project's README) that he also 26 | felt inspired and challenged by Peter Norvig's lecture. 27 | 28 | rrenaud's Gibberish-Detector: 29 | https://github.com/rrenaud/Gibberish-Detector 30 | 31 | Finally, the implied challenge issued by Norvig is to try to come up with a 32 | simple solution to some problem using lots of data. He [probabilistically] 33 | solved the spell-checker problem by using text he found within his computer (not 34 | pulled from the internet). This data is contained within big.txt (6mb). I borrow 35 | this corpus, as did Renaud; you will likely see a lot of similarities between 36 | mine, Renaud's, and Norvig's Python projects. That's the point. 
Please feel 37 | free to send me any questions and comments to my email: rodrigopala91@gmail.com 38 | 39 | Cheers, 40 | Rodrigo 41 | """ 42 | 43 | from bottle import route, run, debug 44 | 45 | from autocomplete import models 46 | 47 | from .autocomplete import predict 48 | 49 | def run_server(port_num=8080): 50 | """little demo server for demo'ing sake""" 51 | models.load_models() 52 | 53 | debug(True) 54 | 55 | @route('//') 56 | def index(first_word, second_word): 57 | return dict(predict(first_word, second_word)) 58 | 59 | run(host='localhost', port=port_num) 60 | 61 | 62 | def load(): 63 | """load the classic Norvig big.txt corpus""" 64 | print("training!") 65 | 66 | models.load_models() 67 | 68 | print("done training!") 69 | 70 | return True 71 | 72 | -------------------------------------------------------------------------------- /autocomplete/autocomplete.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from . import models 4 | 5 | from . import helpers 6 | 7 | #the so called "Hidden" step, thus allowing this module to be 8 | #a "Hidden Markov Model"... Whatever that means... 9 | NEARBY_KEYS = { 10 | 'a': 'qwsz', 11 | 'b': 'vghn', 12 | 'c': 'xdfv', 13 | 'd': 'erfcxs', 14 | 'e': 'rdsw', 15 | 'f': 'rtgvcd', 16 | 'g': 'tyhbvf', 17 | 'h': 'yujnbg', 18 | 'j': 'uikmnh', 19 | 'k': 'iolmj', 20 | 'l': 'opk', 21 | 'm': 'njk', 22 | 'n': 'bhjm', 23 | 'o': 'iklp', 24 | 'p': 'ol', 25 | 'q': 'wa', 26 | 'r': 'edft', 27 | 's': 'wedxza', 28 | 't': 'rfgy', 29 | 'u': 'yhji', 30 | 'v': 'cfgb', 31 | 'w': 'qase', 32 | 'x': 'zsdc', 33 | 'y': 'tghu', 34 | 'z': 'asx' 35 | } 36 | 37 | 38 | def this_word(word, top_n=10): 39 | """given an incomplete word, return top n suggestions based off 40 | frequency of words prefixed by said input word""" 41 | try: 42 | return [(k, v) for k, v in models.WORDS_MODEL.most_common() 43 | if k.startswith(word)][:top_n] 44 | except KeyError: 45 | raise Exception("Please load predictive models. Run:\ 46 | \n\tautocomplete.load()") 47 | 48 | 49 | predict_currword = this_word 50 | 51 | 52 | def this_word_given_last(first_word, second_word, top_n=10): 53 | """given a word, return top n suggestions determined by the frequency of 54 | words prefixed by the input GIVEN the occurence of the last word""" 55 | 56 | #Hidden step 57 | possible_second_words = [second_word[:-1]+char 58 | for char in NEARBY_KEYS[second_word[-1]] 59 | if len(second_word) > 2] 60 | 61 | possible_second_words.append(second_word) 62 | 63 | probable_words = {w:c for w, c in 64 | models.WORD_TUPLES_MODEL[first_word.lower()].items() 65 | for sec_word in possible_second_words 66 | if w.startswith(sec_word)} 67 | 68 | return Counter(probable_words).most_common(top_n) 69 | 70 | 71 | predict_currword_given_lastword = this_word_given_last 72 | 73 | 74 | def predict(first_word, second_word, top_n=10): 75 | """given the last word and the current word to complete, we call 76 | predict_currword or predict_currword_given_lastword to retrive most n 77 | probable suggestions. 78 | """ 79 | 80 | try: 81 | if first_word and second_word: 82 | return predict_currword_given_lastword(first_word, 83 | second_word, 84 | top_n=top_n) 85 | else: 86 | return predict_currword(first_word, top_n) 87 | except KeyError: 88 | raise Exception("Please load predictive models. Run:\ 89 | \n\tautocomplete.load()") 90 | 91 | 92 | def split_predict(text, top_n=10): 93 | """takes in string and will right split accordingly. 
94 | Optionally, you can provide keyword argument "top_n" for 95 | choosing the number of suggestions to return (default is 10)""" 96 | text = helpers.norm_rsplit(text, 2) 97 | return predict(*text, top_n=top_n) 98 | -------------------------------------------------------------------------------- /autocomplete/models.py: -------------------------------------------------------------------------------- 1 | """AUTOCOMPLETE - 2 | This file contains the process where we train our predictive models, Also 3 | helpful are the load_models and save_models functions. 4 | """ 5 | 6 | import os 7 | 8 | import collections 9 | 10 | import pickle 11 | 12 | from . import helpers 13 | 14 | WORDS = [] 15 | 16 | WORD_TUPLES = [] 17 | 18 | WORDS_MODEL = {} 19 | 20 | WORD_TUPLES_MODEL = {} 21 | 22 | #This step is where we transform "raw" data 23 | # into some sort of probabilistic model(s) 24 | def train_models(corpus, model_name="models_compressed.pkl"): 25 | """Takes in a preferably long string (corpus/training data), 26 | split that string into a list, we \"chunkify\" resulting in 27 | a list of 2-elem list. Finally we create a dictionary, 28 | where each key = first elem and each value = Counter([second elems]) 29 | 30 | Will save/pickle model by default ('models_compressed.pkl'). 31 | Set second argument to false if you wish to not save the models. 32 | """ 33 | 34 | # "preperation" step 35 | # word is in WORDS 36 | global WORDS 37 | WORDS = helpers.re_split(corpus) 38 | 39 | # first model -> P(word) 40 | global WORDS_MODEL 41 | WORDS_MODEL = collections.Counter(WORDS) 42 | 43 | # another preperation step 44 | # wordA, wordB are in WORDS 45 | global WORD_TUPLES 46 | WORD_TUPLES = list(helpers.chunks(WORDS, 2)) 47 | 48 | # second model -> P(wordA|wordB) 49 | global WORD_TUPLES_MODEL 50 | WORD_TUPLES_MODEL = {first:collections.Counter() 51 | for first, second in WORD_TUPLES} 52 | 53 | for tup in WORD_TUPLES: 54 | try: 55 | WORD_TUPLES_MODEL[tup[0]].update([tup[1]]) 56 | except: 57 | # hack-y fix for uneven # of elements in WORD_TUPLES 58 | pass 59 | 60 | if model_name: 61 | save_models(os.path.join(os.path.dirname(__file__), model_name)) 62 | 63 | 64 | def train_bigtxt(): 65 | """unnecessary helper function for training against 66 | default corpus data (big.txt)""" 67 | 68 | bigtxtpath = os.path.join(os.path.dirname(__file__), 'big.txt') 69 | with open(bigtxtpath, 'rb') as bigtxtfile: 70 | 71 | train_models(str(bigtxtfile.read())) 72 | 73 | 74 | def save_models(path=None): 75 | """Save models to 'path'. If 'path' not specified, 76 | save to module's folder under name 'models_compressed.pkl'""" 77 | 78 | if path == None: 79 | path = os.path.join(os.path.dirname(__file__), 'models_compressed.pkl') 80 | 81 | print("saving to:", path) 82 | #save for next use. pickle format: (key=model name, value=model) 83 | pickle.dump({'words_model': WORDS_MODEL, 84 | 'word_tuples_model': WORD_TUPLES_MODEL}, 85 | open(path, 'wb'), 86 | protocol=2) 87 | 88 | 89 | def load_models(load_path=None): 90 | """Load autocomplete's built-in model (uses Norvig's big.txt). 
Optionally 91 | provide the path to Python pickle object.""" 92 | 93 | if load_path is None: 94 | load_path = os.path.join(os.path.dirname(__file__), 95 | 'models_compressed.pkl') 96 | try: 97 | models = pickle.load(open(load_path,'rb')) 98 | 99 | global WORDS_MODEL 100 | WORDS_MODEL = models['words_model'] 101 | 102 | global WORD_TUPLES_MODEL 103 | WORD_TUPLES_MODEL = models['word_tuples_model'] 104 | 105 | print("successfully loaded: models_compressed.pkl") 106 | except IOError: 107 | print("Error in opening pickle object. Training on default corpus text.") 108 | train_bigtxt() 109 | except KeyError: 110 | print("Error in loading both predictve models.\ 111 | Training on default corpus text.") 112 | train_bigtxt() 113 | except ValueError: 114 | print("Corrupted pickle string.\ 115 | Training on default corpus text (big.txt)") 116 | train_bigtxt() 117 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | *Autocomplete* or: How I learned to stop spelling and love our AI overlords 2 | =========================================================================== 3 | 4 | A practical guide to implementing "autocomplete"! It follows the 5 | sometimes misunderstood principles of conditional probability 6 | distributions and the generalized Hidden Markov Model (HMM). 7 | 8 | Fun fact: Your iPhone's "autocomplete" was implemented using a HMM! Plus 9 | the extra stuff it chose to `sue Samsung 10 | for `__. 11 | 12 | Skip to: 13 | -------- 14 | 15 | - `How to's <#how-to-install>`__ 16 | - `tl;dr? <#tldr>`__ 17 | - `Motivation <#motivation>`__ 18 | - `ELI5 <#explain-like-im-5>`__ 19 | - `If you're not 5 <#if-youre-not-5>`__ 20 | 21 | -------------- 22 | 23 | How to install: 24 | --------------- 25 | 26 | :: 27 | 28 | pip install autocomplete 29 | 30 | How to use: 31 | ----------- 32 | 33 | .. code:: python 34 | 35 | import autocomplete 36 | 37 | # load pickled python Counter objects representing our predictive models 38 | # I use Peter Norvigs big.txt (http://norvig.com/big.txt) to create the predictive models 39 | autocomplete.load() 40 | 41 | # imagine writing "the b" 42 | autocomplete.predict('the','b') 43 | 44 | [('blood', 204), 45 | ('battle', 185), 46 | ('bone', 175), 47 | ('best', 149), 48 | ('body', 149), 49 | ...] 50 | 51 | # now you type an "o" 52 | 53 | autocomplete.predict('the','bo') 54 | 55 | [('bone', 175), 56 | ('body', 149), 57 | ('bones', 122), 58 | ('boy', 46), 59 | ('bottom', 32), 60 | ('box', 24), 61 | ...] 62 | 63 | If you have your own language model in the form described in 64 | `ELI5 <#explain-like-im-5>`__, then use the *models* submodule to call 65 | the training method: 66 | 67 | .. code:: python 68 | 69 | from autocomplete import models 70 | 71 | models.train_models('some giant string of text') 72 | 73 | Want to run it as a server (bottlepy required)? 74 | 75 | .. code:: python 76 | 77 | import autocomplete 78 | 79 | autocomplete.run_server() 80 | 81 | #output 82 | Bottle v0.12.8 server starting up (using WSGIRefServer())... 83 | Listening on http://localhost:8080/ 84 | Hit Ctrl-C to quit. 
85 | 86 | Now head over to http://localhost:8080/the/bo 87 | 88 | :: 89 | 90 | http://localhost:8080/the/bo 91 | #output 92 | {"body": 149, "box": 24, "bottom": 32, "boy": 46, "borzois": 16, "bodies": 13, "bottle": 13, "bones": 122, "book": 14, "bone": 175} 93 | 94 | http://localhost:8080/the/bos 95 | #output 96 | {"boscombe": 11, "boston": 7, "boss": 1, "bosom": 5, "bosses": 4} 97 | 98 | Obligatory tests 99 | ~~~~~~~~~~~~~~~~ 100 | 101 | :: 102 | 103 | python setup.py test 104 | 105 | -------------- 106 | 107 | `tl;dr `__ 108 | ---------------------------------------------------------------------------------------- 109 | 110 | The following code excerpt is my interpretation of a series of 111 | lessons/concepts expressed in a number of different books. 112 | 113 | The unifying concept can be said to be `conditional 114 | probability `__: 115 | 116 | :: 117 | 118 | P(A , B) = P(B | A) * P(A) 119 | 120 | Which can read as saying: 121 | 122 | :: 123 | 124 | The probability of A and B occuring is equal to the probability of B occuring, given that A has occured 125 | 126 | More on this below. 127 | 128 | .. code:: python 129 | 130 | 131 | # "preperation" step 132 | # for every word in corpus, normalize ('The' -> 'the'), insert to list 133 | WORDS = helpers.re_split(corpus) 134 | 135 | # first model -> P(word) 136 | # Counter constructor will take a list of elements and create a frequency distribution (histogram) 137 | WORDS_MODEL = collections.Counter(WORDS) 138 | 139 | # another preperation step 140 | # [a,b,c,d] -> [[a,b], [c,d]] 141 | WORD_TUPLES = list(helpers.chunks(WORDS, 2)) 142 | 143 | # second model -> P(next word | prev. word) 144 | # I interpret "..| prev. word)" as saying "dictionary key 145 | # leading to seperate and smaller (than WORDS_MODEL) freq. dist. 146 | WORD_TUPLES_MODEL = {first:collections.Counter() for first, second in WORD_TUPLES} 147 | 148 | for prev_word, next_word in WORD_TUPLES: 149 | # this is called the "conditioning" step where we assert 150 | # that the probability space of all possible "next_word"'s 151 | # is "conditioned" under the event that "prev_word" has occurred 152 | WORD_TUPLES_MODEL[prev_word].update([next_word]) 153 | 154 | Textbooks, and locations therein, where the concept-in-practice has been 155 | expressed: 156 | 157 | I. `Intro to Statistical Natural Language 158 | Processing `__ 159 | - Manning, Schütze, 1999 160 | 161 | :: 162 | 163 | a. frequency distribution showing the most common words and frequencies in *Tom Sawyer*, pg. 21 164 | 165 | b. conditional probability definition expressed in page 42 - section 2.1.2 166 | 167 | c. the intuition for *frequency* distributions found in pg. 153 (provided in the context of finding [*Collocations*](http://en.wikipedia.org/wiki/Collocation)) 168 | 169 | II. `Probabilistic Graphical 170 | Models `__ 171 | - Kohler, Friedman, 2009 172 | 173 | a. conditional probability definition found on pg. 18 (hilariously 174 | and coincidentally found in section 2.1.2.1) 175 | 176 | III. `Artificial Intelligence - A Modern 177 | Approach `__ - Russell, Norvig, 3rd. 178 | ed. 2010 179 | 180 | a. conditional probability concept explained in pg. 485 181 | 182 | b. the "language" (I take to mean "intuition" for asserting things 183 | in the probabilistic sense) pg. 486 184 | 185 | c. the notion of "conditioning" found in pg. 
492-494 186 | 187 | Motivation 188 | ---------- 189 | 190 | Similar to the motivation behind 191 | `eatiht `__, I found 192 | that it took far too long to find a palpable theory-to-application 193 | example of what amounts to more than a 500 pages of words across 3 194 | books, each spanning a large index of, in certain cases, 195 | *counter-intuitive* nomenclature; read the `light 196 | criticisms `__ 197 | made by Michael I. Jordan on the matter (he was recently named `#2 198 | machine learning expert "we need to know" on 199 | dataconomy.com `__). 200 | 201 | You can find similar thoughts being expressed `**in an article from 2008 202 | (updated 203 | 2009)** `__ 204 | by `Brennan O'Connor `__ 205 | 206 | -------------- 207 | 208 | `*This work is dedicated to my siblings* <#note-1>`__. 209 | 210 | Explain like I'm 5\ `\* <#note-1>`__ 211 | ------------------------------------ 212 | 213 | \*Warning! This explanation is literally intended for young kids - I'm 214 | actually trying to see if these concepts can be explained to an audience 215 | unaware of the nomenclature used within the statistical 216 | `nlp `__ and 217 | other machine learning fields. For example, my 7, 9, 11, 14 y.o. 218 | siblings, and basically anyone else who's ever read a story to a child - 219 | they would be a part of the target audience. 220 | 221 | If you've found this readable and informative, please consider putting 222 | on the goofiest face and reading this to your kids, if you have any :) 223 | If you do, please send me your thoughts on the experience. 224 | 225 | I'm only interested in lowering the barrier to entry. I should have 226 | included this note since the beginning (sorry to those who undoubtedly 227 | left with a bad taste in their mouths). 228 | 229 | You can contact me at rodrigopala91@gmail.com 230 | 231 | Thanks for reading, 232 | 233 | Rodrigo 234 | 235 | ELI5 236 | ---- 237 | 238 | No. I'm explaining this like you're 5. I know you're not *5* , *you 239 | guys... Chris, stop jumping on your sister's back*! 240 | 241 | Ok, so I'm saying, *imagine I'm 5!* 242 | 243 | Oh, that was easy now huh? Let's just forget the *I'm 5* part. 244 | 245 | Imagine a giant collection of books. 246 | 247 | For example, all the Harry Potter and Hunger Games novels put together. 248 | 249 | What if I asked you to go through all the pages and all the words in 250 | those pages? 251 | 252 | Now I'm not asking you *four* to actually *read* the books. You know, 253 | just go through, beginning to end, and notice each word. 254 | 255 | For every new word you see, write it down, and put a "1" next to it, and 256 | everytime you see a word *again*, add "1" more to the previous number. 257 | 258 | So basically I'm asking y'all to keep count of how many times a word 259 | comes up. 260 | 261 | Got it? If yes, cool! If not, find a sibling, friend, or adult near you 262 | and ask them to help you out :) 263 | 264 | ... 265 | 266 | Say you start with *Harry Potter and the Sorcerer's Stone*: 267 | 268 | :: 269 | 270 | Mr. and Mrs. Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much... 271 | 272 | And imagine that you're on the 5th word. This or something close to this 273 | is what you're going for: 274 | 275 | :: 276 | 277 | Mr. -> 1 278 | and -> 1 279 | Mrs. -> 1 280 | Dursley -> 1 281 | of -> 1 282 | 283 | Or if you're a *wannabe-Harry-Potter* fan, ah I'm just kidding! 
284 | 285 | If you started with *the-book-that-must-not-be-named* - I know you guys 286 | won't get it, but persons my age will :) 287 | 288 | Alright! So you started with *The Hunger Games*: 289 | 290 | :: 291 | 292 | When I wake up, the other side of the bed is cold... 293 | 294 | By the sixth word you have: 295 | 296 | :: 297 | 298 | When -> 1 299 | I -> 1 300 | wake -> 1 301 | up -> 1 302 | the -> 1 303 | 304 | You have a long day ahead of you... 305 | 306 | ... 307 | 308 | *1,105,285 words later* 309 | 310 | Now that you're done tallying up all those words, why not order all 311 | these words by the *number of times you've seen them*? 312 | 313 | See you next week! 314 | 315 | ... 316 | 317 | Back so soon? You should have gotten something like this: 318 | 319 | :: 320 | 321 | psst*, remember, the format is: 322 | word -> # of times the word appears 323 | 324 | 'the' -> 80030 325 | 'of' -> 40025 326 | 'and' -> 38313 327 | 'to' -> 28766 328 | 'in' -> 22050 329 | 'a' -> 21155 330 | 'that'-> 12512 331 | 'he' -> 12401 332 | 'was' -> 11410 333 | 'it' -> 10681 334 | ... there's a lot more words you've tallied up... 335 | 336 | Those were the most common words. 337 | 338 | Now on the *less-frequent* end, you'll find your words appearing not as 339 | often... 340 | 341 | :: 342 | 343 | ... 29137 words later. 344 | 'przazdziecka' -> 1 345 | 'disclosure' -> 1 346 | 'galvanism' -> 1 347 | 'repertoire' -> 1 348 | 'bravado' -> 1 349 | 'gal' -> 1 350 | 'ideological' -> 1 351 | 'guaiacol' -> 1 352 | 'expands' -> 1 353 | 'revolvers' -> 1 354 | 355 | Yeah Chris? Oh, 'what does *lez freekend*' mean? Um, so it means 356 | something like: *you probably won't hear or read that word very often.* 357 | 358 | Now what if I asked you to help me find this word I'm looking for? And I 359 | know this word starts with the letters: 'th'. 360 | 361 | I'm pretty sure you guys can do this much faster! 362 | 363 | ... 364 | 365 | *5 minutes later!* 366 | 367 | ... 368 | 369 | Not bad! You only had to go through 29157 unique words after all! 370 | 371 | :: 372 | 373 | 'the' -> 80030 374 | 'that' -> 12512 375 | 'this' -> 4063 376 | 'they' -> 3938 377 | 'there'-> 2972 378 | 'their'-> 2955 379 | 'them' -> 2241 380 | 'then' -> 1558 381 | 'these'-> 1231 382 | 'than' -> 1206 383 | ... 229 words more... 384 | 385 | 239 words, still kind of lot though huh? And you know your big brother, 386 | he's too lazy to do this work *by hand* (*cough* program it up *cough*) 387 | ;) 388 | 389 | So the word I'm looking for is on the tip of my tongue. I think the next 390 | letter is "i". 391 | 392 | *1 minute later* 393 | 394 | :: 395 | 396 | 'this' -> 4063 397 | 'think' -> 557 398 | 'things' -> 321 399 | 'thing' -> 303 400 | 'third' -> 239 401 | 'thin' -> 166 402 | 'thinking' -> 137 403 | 'thirty' -> 123 404 | 'thick' -> 77 405 | 'thirds' -> 43 406 | ... 36 words more... 407 | 408 | *I scan through the first 10 words.* Oh, I just remembered that the next 409 | letter is 'r'. 410 | 411 | *You start taking out even more words.* 412 | 413 | *10 seconds later.* 414 | 415 | :: 416 | 417 | 'third' -> 239 418 | 'thirty' -> 123 419 | 'thirds' -> 43 420 | 'thirteen' -> 32 421 | 'thirst' -> 13 422 | 'thirteenth' -> 11 423 | 'thirdly' -> 8 424 | 'thirsty' -> 5 425 | 'thirtieth' -> 3 426 | 'thirties' -> 2 427 | 428 | Aha, 'thirdly' was the word I was looking for! What, you never heard of 429 | the word "thirdly" before? 430 | 431 | Now you might be saying to yourself, "*that's pretty cool!*\ ", and 432 | you're right! 
433 | 434 | And you know what's cooler? *Making everyone's life a tiny bit easier* 435 | is! :) 436 | 437 | But how can you do that with just *words*? 438 | 439 | Aren't words boring and dull? 440 | 441 | It's like all we do is talk, write, and think with *words*. I mean, how 442 | lame, I can't even describe to you this *autocomplete* 443 | thing-slash-idea-thing without having to write it out with *words*! 444 | 445 | Ugh! I hate words! 446 | 447 | *Whoah, wait a minute! That was not cool of me! Let's relax for a 448 | minute.* 449 | 450 | Let's try to give an imaginary hug to the word-factory in our brains. 451 | That part of our brain works so hard, even when we don't ask it to. How 452 | nice of our brain to do that. Not! 453 | 454 | What I'm trying to is sometimes it's not so nice for our brains to 455 | distract us, especially when we have homework or other, real-world, 456 | problems like adult-homework. 457 | 458 | So how about this: let's try to think about *what* the next sentence 459 | coming out of our own mouths *will be*\ `\* <#note-2>`__. 460 | 461 | Now if you're thinking about what will be coming out of my mouth, or out 462 | of your mouth, or your mouth, or your mouth, or your mouth, you're doing 463 | it wrong! (to readers who aren't one of my 4 younger siblings, that's 464 | how many I have). 465 | 466 | Try your best to think about *what* the next sentence coming out of 467 | *your own* mouth will be. 468 | 469 | ... 470 | 471 | Did you decide on your sentence? Good! 472 | 473 | Now what if I asked you to give me two reasons explaining *why* and 474 | *how* you chose the sentence you chose? 475 | 476 | Wait, I can't even do that! Let's make it easier on ourselves and 477 | explain *why* and *how* we chose the first *word*. 478 | 479 | Still pretty hard huh? 480 | 481 | If you thought about it, and you thought it was pretty darn hard to give 482 | a *good and honest* reason as to why it is you chose the word you chose, 483 | let's bring out a word you guys might not understand: *probability*. 484 | 485 | If you feel like you don't *get* what the word means, sure you do! Just 486 | use the word "probably" in one of your sentences, but but try to makes 487 | some sense. 488 | 489 | What do I mean? Well, let's just consider the English language. Like 490 | most other things, the English language has rules. 491 | 492 | The kind of rules that can be simplified down to: 493 | 494 | 1) "***something*** *action* ***something***". 495 | 496 | 2) Replace ***something***'s and ***action*** with words that make sense 497 | to you. 498 | 499 | Fair enough, right? 500 | 501 | Now, imagine you could put "pause" right after the first word that comes 502 | out of your mouth. 503 | 504 | Let's just say that first word is "the". 505 | 506 | Now in the case that you stuttered for reasons outside your 507 | conscientious control (for example: "thhh thhe the"). No big deal, you 508 | meant to say "the", so let's *flatten* it to just that! 509 | 510 | With that *word* said, what words do you *think* you might have said 511 | after it? 512 | 513 | You might tell me, "*any word I want!* 514 | 515 | Of course you could have! I bet you spent a millisecond thinking about 516 | whether or not the next word you were going to say was going to be: 517 | *guaiacol*. 518 | 519 | I *know* because I thought about using that word too! 520 | 521 | I can remember the first time I heard (or read) *guaiacol* like it was 522 | yesterday. I read it in some funky article on the internet. 
I found the 523 | word in a list of words that don't appear too often in the English 524 | language. 525 | 526 | After I read it, I was able to fit *guaiacol* nicely into that part of 527 | my brain where I... uhh.. was... able... uhh... 528 | 529 | Oh, you *know*, that place in my brain where I get to choose whether I 530 | want to say *the apple*, *the automobile*, *the austronaut*, etc. 531 | 532 | ... 533 | 534 | Ok, so clearly I'm no brainician, and that may or may not be the way our 535 | brain works - actually, it's probably super super unlikely. 536 | 537 | But even though that idea is probably wrong, the idea itself sounds like 538 | a pretty darn good way of suggesting the next word or words somebody is 539 | trying to *type*. 540 | 541 | What if you had a way to count the number of times you've heard "apple" 542 | said after the word "the"? 543 | 544 | Ask yourself the same question, but now with the word "automobile" 545 | instead of "apple". 546 | 547 | What if you had the time to think about every possible word that you've 548 | ever heard spoken after the word "the"? I'd say it might have looked 549 | something like this: 550 | 551 | :: 552 | 553 | Words you might have heard following the word "the" and the number of times you might have heard it 554 | 555 | 'same' -> 996 556 | 'french' -> 688 557 | 'first' -> 652 558 | 'old' -> 591 559 | 'emperor' -> 581 560 | 'other' -> 528 561 | 'whole' -> 500 562 | 'united' -> 466 563 | 'room' -> 376 564 | 'most' -> 373 565 | 566 | ... 9331 more words... 567 | 568 | Not impressed with your brain yet? Let's continue this little thought 569 | experiment further. 570 | 571 | Imagine that you just said "the", and you could put pause after the 572 | first *letter* of the next word out of your mouth: "h". 573 | 574 | Real quick, think of the shortest amount of time you can think of. Think 575 | of the shortest *second* you can think of. Now shorter than that too. 576 | 577 | At this point, you can't even call that length of time a *second*. But 578 | in that length of time, your brain may have just done this: 579 | 580 | :: 581 | 582 | Every word you've ever heard coming after the word "the": 583 | 584 | 'house' -> 284 585 | 'head' -> 117 586 | 'hands' -> 101 587 | 'hand' -> 97 588 | 'horses' -> 71 589 | 'hill' -> 64 590 | 'highest' -> 64 591 | 'high' -> 57 592 | 'history' -> 56 593 | 'heart' -> 55 594 | 595 | And that brain you got did this realllllyyyyyy fast. Faster than Google, 596 | Bing, Yahoo and any other company can ever hope to beat. And your brain 597 | did this without even asking for your permission. I think our brains are 598 | trying to control us you guys, oh no! 599 | 600 | If you're not 5 601 | --------------- 602 | 603 | The basic idea is this: 604 | 605 | Assume you have a large collection of Enlish-understandable text merged 606 | into a single string. 607 | 608 | Start by transforming that string into a list of words (AKA *ngrams of 609 | word-legth*), and also (but not required) normalize each word ('The' -> 610 | 'the'). 611 | 612 | Once you have a normalized list of words, you can start building a 613 | frequency distribution measuring the frequency of each word. 614 | 615 | ... 616 | 617 | At this point you can start "predict" the "final state" of a 618 | word-in-progress. 
But consider the case where a user types in some query 619 | box: 620 | 621 | :: 622 | 623 | "The th" 624 | 625 | And he intends to write: 626 | 627 | :: 628 | 629 | "The third" 630 | 631 | With the above predictive model, you'll be suggesting something like: 632 | 633 | :: 634 | 635 | [ 636 | ('the', 80030), 637 | ('they', 3938), 638 | ('there', 2972), 639 | ... 640 | ] 641 | 642 | This explains one specific type of predictive model, which can be 643 | written as P(word), and you've just seen the pitfalls of using **just** 644 | this model. 645 | 646 | Now for the next word, ask yourself, what's the probability that I'm 647 | going to type the word "apple" given that I wrote "tasty"? 648 | 649 | In machine learning and AI books, you'll be presented *Conditional 650 | Probability* with the following equation: 651 | 652 | :: 653 | 654 | P(word A and word B) = P(word B | word A) * P(word A) 655 | 656 | That equation addresses the problem that I mentioned. 657 | 658 | We've handled P(wordA) already. 659 | 660 | To handle P(word B \| word A), which reads *probability of word A given 661 | word B *, I take a *literall* interpretation of the word "given", in 662 | that context, to mean the following: 663 | 664 | *"word A" is the key pointing to a probability distribution representing 665 | all the words that follow "word A"* 666 | 667 | Once we can represent this second model, we can also apply the 668 | *filtering* step - given that we know more letters in the second word, 669 | we can zone in on more precise suggestions. 670 | 671 | -------------- 672 | 673 | Afterword 674 | ~~~~~~~~~ 675 | 676 | notes: \*I have to give a shout out to `Sam 677 | Harris `__ for being, AFAIK, the first 678 | person or one of the firsts, in `wonderfully putting into 679 | words `__ what I've 680 | borrowed and slightly adapted for this writing. `I highly recommend his 681 | work `__ 682 | 683 | Another shoutout to `Peter Norvig `__ for inspiring 684 | me and probably many others with our own little "toy" programs. His 685 | *Occam's Razor* approach to problem solving will likely cause some 686 | confusion as it may appear that my work is an almost full on copy-paste 687 | of his `*How to Write a Spell 688 | Checker* `__! 689 | 690 | But I swear it's not! I actually I think I may have out-Norvig'ed Peter 691 | Norvig when it comes to describing `conditional 692 | probability `__: 693 | P(wordA & wordB) = P(wordB \| wordA)\*P(wordA) 694 | 695 | And another one to Rob Renaud's `Gibberish 696 | Detector `__. I, out of 697 | pure chance, ran into his project some time after running into Norvig's 698 | article. I can't describe *how much it helped* to intuitively understand 699 | what the heavy hitters of "AI" consider to be introductory material; 700 | this was greatly needed b/c at the time, I felt overwhelmed by my own 701 | desire to really understand this area, and everything else going on. 702 | 703 | I do have a second article about this exact thing, only expressed 704 | differently (audience is non-programming), and it may or may not be 705 | posted soon! [STRIKEOUT:Oh and the code too, that is if someone hasn't 706 | gotten to translating the above article to code before I can get to 707 | uploading the project :P I'm trying to get the kinks out of here and the 708 | code so it's simple, duh!] 
709 | 710 | I dedicate this work to my sisters, Cat, Melissa and Christine, and my 711 | favorite brother, Christian :) 712 | 713 | note 1 714 | ^^^^^^ 715 | 716 | `go back <#explain-like-im-5>`__ 717 | 718 | *To avoid confusion, I wrote this section in the form of a letter to my 719 | younger siblings* 720 | 721 | note 2 722 | ^^^^^^ 723 | 724 | \*I'm borrowing, what I consider, `one of the most beautiful thought 725 | experiments I've ever heard trying to describe one's 726 | self `__. I'm a big 727 | fan of Sam Harris's work. Highly recommend! 728 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *Autocomplete* or: How I learned to stop spelling and love our AI overlords 2 | === 3 | 4 | Autocomplete is an adult and kid friendly exercise in creating your own AI program. 5 | 6 | For those short on time, the [ELI5](#explain-like-im-5) section is devoid of nomenclature but lengthy; the [tl;dr](#tldr) section describes the implementation using the appropriate terms - basic principles of conditional probability, generalized [Markov chain](http://en.wikipedia.org/wiki/Markov_chain) - but is short, concise, and includes references for further reading. 7 | 8 | ## Skip to: 9 | 10 | * [How to's](#how-to-install) 11 | * [New: Spell Correction](#spell-correction) 12 | * [tl;dr?](#tldr) 13 | * [Motivation](#motivation) 14 | * [ELI5](#explain-like-im-5) 15 | * [If you're not 5](#if-youre-not-5) 16 | * [Afterword](https://github.com/rodricios/autocomplete#afterword) 17 | 18 | 19 | --- 20 | 21 | ## How to install: 22 | 23 | pip install autocomplete 24 | 25 | ## How to use: 26 | 27 | ```python 28 | import autocomplete 29 | 30 | # load pickled python Counter objects representing our predictive models 31 | # I use Peter Norvigs big.txt (http://norvig.com/big.txt) to create the predictive models 32 | autocomplete.load() 33 | 34 | # imagine writing "the b" 35 | autocomplete.predict('the','b') 36 | 37 | [('blood', 204), 38 | ('battle', 185), 39 | ('bone', 175), 40 | ('best', 149), 41 | ('body', 149), 42 | ...] 43 | 44 | # now you type an "o" 45 | 46 | autocomplete.predict('the','bo') 47 | 48 | [('bone', 175), 49 | ('body', 149), 50 | ('bones', 122), 51 | ('boy', 46), 52 | ('bottom', 32), 53 | ('box', 24), 54 | ...] 55 | ``` 56 | 57 | ### Spell Correction 58 | 59 | Now say you are in the process of typing "body" (with a preceding "the") 60 | 61 | ```python 62 | 63 | autocomplete.predict('the','bo') 64 | 65 | [('bone', 175), 66 | ('body', 149), 67 | ('bones', 122), 68 | ('boy', 46), 69 | ('bottom', 32), 70 | ('box', 24), 71 | ...] 72 | 73 | ``` 74 | 75 | And then you make the fatal error of typing an "f" instead of a "d" 76 | 77 | ```python 78 | 79 | autocomplete.predict('the','bof') 80 | 81 | [('body', 149), 82 | ('bottom', 32), 83 | ('borzois', 16), 84 | ('bottle', 13), 85 | ('bodies', 13), 86 | ('border', 12) 87 | ...] 88 | 89 | ``` 90 | 91 | Relax! Autocomplete has you covered. Using a simple ["fat-finger"](http://en.wikipedia.org/wiki/Fat-finger_error) error model, 92 | you can rest assured that you won't be making [six-hundred billion dollar mistakes](http://www.bbc.com/news/business-29454265) at your Japanese investment firm. 
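Under the hood, the error model simply swaps the last typed letter with its physical keyboard neighbors and then filters the conditional model with every resulting prefix. Below is a simplified, self-contained sketch of that step (the full `NEARBY_KEYS` table ships in `autocomplete/autocomplete.py`; the table and counts used here are abridged toy values for illustration only):

```python
# Simplified sketch of the "fat-finger" step used by
# autocomplete.predict_currword_given_lastword (see autocomplete/autocomplete.py).
# NEARBY_KEYS is abridged here; the package ships the full QWERTY table.
from collections import Counter

NEARBY_KEYS = {'d': 'erfcxs', 'f': 'rtgvcd'}

def candidate_prefixes(typed):
    """'bof' -> ['bor', 'bot', ..., 'bod', 'bof']: the last letter swapped with its neighbors."""
    neighbors = NEARBY_KEYS.get(typed[-1], '')
    return [typed[:-1] + ch for ch in neighbors] + [typed]

def suggest(word_counts, typed, top_n=5):
    """Rank words that start with any fat-finger candidate of `typed`."""
    prefixes = candidate_prefixes(typed)
    matches = {w: c for w, c in word_counts.items()
               if any(w.startswith(p) for p in prefixes)}
    return Counter(matches).most_common(top_n)

# toy stand-in for the counts stored under WORD_TUPLES_MODEL['the']
the_counts = Counter({'body': 149, 'bone': 175, 'bottom': 32, 'ford': 3})
print(suggest(the_counts, 'bof'))
# [('body', 149), ('bottom', 32)] -- 'd' and 't' are keyboard neighbors of the mistyped 'f'
```

In the package itself, the counts come from `WORD_TUPLES_MODEL[prev_word]` (built by `models.train_models`) rather than a hand-written `Counter`.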
93 | 94 | 95 | If you have your own language model in the form described in [ELI5](#explain-like-im-5), then use the *models* submodule to call the training method: 96 | 97 | ```python 98 | from autocomplete import models 99 | 100 | models.train_models('some giant string of text') 101 | 102 | ``` 103 | 104 | Want to run it as a server (bottlepy required)? 105 | 106 | ```python 107 | import autocomplete 108 | 109 | autocomplete.run_server() 110 | 111 | #output 112 | Bottle v0.12.8 server starting up (using WSGIRefServer())... 113 | Listening on http://localhost:8080/ 114 | Hit Ctrl-C to quit. 115 | 116 | ``` 117 | 118 | Now head over to http://localhost:8080/the/bo 119 | 120 | ``` 121 | http://localhost:8080/the/bo 122 | #output 123 | {"body": 149, "box": 24, "bottom": 32, "boy": 46, "borzois": 16, "bodies": 13, "bottle": 13, "bones": 122, "book": 14, "bone": 175} 124 | 125 | http://localhost:8080/the/bos 126 | #output 127 | {"boscombe": 11, "boston": 7, "boss": 1, "bosom": 5, "bosses": 4} 128 | ``` 129 | 130 | ### Obligatory tests 131 | ``` 132 | python setup.py test 133 | ``` 134 | 135 | 136 | --- 137 | 138 | ## [tl;dr](https://github.com/rodricios/autocomplete/blob/master/autocomplete/models.py) 139 | 140 | The following code excerpt is my interpretation of a series of lessons/concepts expressed in a number of different books. 141 | 142 | The unifying concept can be said to be [conditional probability](http://en.wikipedia.org/wiki/Conditional_probability): 143 | 144 | P(A , B) = P(B | A) * P(A) 145 | 146 | Which can read as saying: 147 | 148 | The probability of A and B occuring is equal to the probability of B occuring, given that A has occured 149 | 150 | More on this below. 151 | 152 | ```python 153 | 154 | # "preperation" step 155 | # for every word in corpus, normalize ('The' -> 'the'), insert to list 156 | WORDS = helpers.re_split(corpus) 157 | 158 | # first model -> P(word) 159 | # Counter constructor will take a list of elements and create a frequency distribution (histogram) 160 | WORDS_MODEL = collections.Counter(WORDS) 161 | 162 | # another preperation step 163 | # [a,b,c,d] -> [[a,b], [b,c], [c,d]] 164 | WORD_TUPLES = list(helpers.chunks(WORDS, 2)) 165 | 166 | # second model -> P(next word | prev. word) 167 | # I interpret "..| prev. word)" as saying "dictionary key 168 | # leading to seperate and smaller (than WORDS_MODEL) freq. dist. 169 | WORD_TUPLES_MODEL = {first:collections.Counter() for first, second in WORD_TUPLES} 170 | 171 | for prev_word, next_word in WORD_TUPLES: 172 | # this is called the "conditioning" step where we assert 173 | # that the probability space of all possible "next_word"'s 174 | # is "conditioned" under the event that "prev_word" has occurred 175 | WORD_TUPLES_MODEL[prev_word].update([next_word]) 176 | 177 | ``` 178 | 179 | Textbooks, and locations therein, where the concept-in-practice has been expressed: 180 | 181 | I. [Intro to Statistical Natural Language Processing](http://ics.upjs.sk/~pero/web/documents/pillar/Manning_Schuetze_StatisticalNLP.pdf) - Manning, Schütze, 1999 182 | 183 | a. frequency distribution showing the most common words and frequencies in *Tom Sawyer*, pg. 21 184 | 185 | b. conditional probability definition expressed in page 42 - section 2.1.2 186 | 187 | c. the intuition for *frequency* distributions found in pg. 153 (provided in the context of finding [*Collocations*](http://en.wikipedia.org/wiki/Collocation)) 188 | 189 | II. 
[Probabilistic Graphical Models](http://mitpress.mit.edu/books/probabilistic-graphical-models) - Kohler, Friedman, 2009 190 | 191 | a. conditional probability definition found on pg. 18 (hilariously and coincidentally found in section 2.1.2.1) 192 | 193 | III. [Artificial Intelligence - A Modern Approach](http://aima.cs.berkeley.edu) - Russell, Norvig, 3rd. ed. 2010 194 | 195 | a. conditional probability concept explained in pg. 485 196 | 197 | b. the "language" (I take to mean "intuition" for asserting things in the probabilistic sense) pg. 486 198 | 199 | c. the notion of "conditioning" found in pg. 492-494 200 | 201 | ## Motivation 202 | 203 | Similar to the motivation behind [eatiht](https://github.com/rodricios/eatiht#motivation), I found that it took far too long to find a palpable theory-to-application example of what amounts to more than a 500 pages of words across 3 books, each spanning a large index of, in certain cases, *counter-intuitive* nomenclature; read the [light criticisms](http://www.reddit.com/r/MachineLearning/comments/2fxi6v/ama_michael_i_jordan/ckep3z6) made by Michael I. Jordan on the matter (he was recently named [#2 machine learning expert "we need to know" on dataconomy.com](http://dataconomy.com/10-machine-learning-experts-you-need-to-know/)). 204 | 205 | You can find similar thoughts being expressed [**in an article from 2008 (updated 2009)**](http://brenocon.com/blog/2008/12/statistics-vs-machine-learning-fight/) by [Brennan O'Connor](http://brenocon.com) 206 | 207 | --- 208 | 209 | [*This work is dedicated to my siblings*](#note-1). 210 | 211 | ## Explain like I'm 5[*](#note-1) 212 | 213 | *Warning! This explanation is literally intended for young kids - I'm actually trying to see if these concepts can be explained to an audience unaware of the nomenclature used within the statistical [nlp](http://en.wikipedia.org/wiki/Natural_language_processing) and other machine learning fields. For example, my 7, 9, 11, 14 y.o. siblings, and basically anyone else who's ever read a story to a child - they would be a part of the target audience. 214 | 215 | If you've found this readable and informative, please consider putting on the goofiest face and reading this to your kids, if you have any :) If you do, please send me your thoughts on the experience. 216 | 217 | I'm only interested in lowering the barrier to entry. I should have included this note since the beginning (sorry to those who undoubtedly left with a bad taste in their mouth). 218 | 219 | You can contact me at rodrigopala91@gmail.com 220 | 221 | Thanks for reading, 222 | 223 | Rodrigo 224 | 225 | ## ELI5 226 | 227 | No. I'm explaining this like you're 5. I know you're not *5* , *you guys... Chris, stop jumping on your sister's back*! 228 | 229 | Ok, so I'm saying, *imagine I'm 5!* 230 | 231 | Oh, that was easy now huh? Let's just forget the *I'm 5* part. 232 | 233 | Imagine a giant collection of books. 234 | 235 | For example, all the Harry Potter and Hunger Games novels put together. 236 | 237 | What if I asked you to go through all the pages and all the words in those pages? 238 | 239 | Now I'm not asking you *four* to actually *read* the books. You know, just go through, beginning to end, and notice each word. 240 | 241 | For every new word you see, write it down, and put a "1" next to it, and everytime you see a word *again*, add "1" more to the previous number. 242 | 243 | So basically I'm asking y'all to keep count of how many times a word comes up. 244 | 245 | Got it? If yes, cool! 
If not, find a sibling, friend, or adult near you and ask them to help you out :) 246 | 247 | ... 248 | 249 | Say you start with *Harry Potter and the Sorcerer's Stone*: 250 | 251 | Mr. and Mrs. Dursley of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much... 252 | 253 | And imagine that you're on the 5th word. This or something close to this is what you're going for: 254 | 255 | Mr. -> 1 256 | and -> 1 257 | Mrs. -> 1 258 | Dursley -> 1 259 | of -> 1 260 | 261 | 262 | Or if you're a *wannabe-Harry-Potter* fan, ah I'm just kidding! 263 | 264 | If you started with *the-book-that-must-not-be-named* - I know you guys won't get it, but persons my age will :) 265 | 266 | Alright! So you started with *The Hunger Games*: 267 | 268 | When I wake up, the other side of the bed is cold... 269 | 270 | By the sixth word you have: 271 | 272 | When -> 1 273 | I -> 1 274 | wake -> 1 275 | up -> 1 276 | the -> 1 277 | 278 | You have a long day ahead of you... 279 | 280 | ... 281 | 282 | *1,105,285 words later* 283 | 284 | Now that you're done tallying up all those words, why not order all these words by the *number of times you've seen them*? 285 | 286 | See you next week! 287 | 288 | ... 289 | 290 | Back so soon? You should have gotten something like this: 291 | 292 | psst*, remember, the format is: 293 | word -> # of times the word appears 294 | 295 | 'the' -> 80030 296 | 'of' -> 40025 297 | 'and' -> 38313 298 | 'to' -> 28766 299 | 'in' -> 22050 300 | 'a' -> 21155 301 | 'that'-> 12512 302 | 'he' -> 12401 303 | 'was' -> 11410 304 | 'it' -> 10681 305 | ... there's a lot more words you've tallied up... 306 | 307 | 308 | Those were the most common words. 309 | 310 | Now on the *less-frequent* end, you'll find your words appearing not as often... 311 | 312 | ... 29137 words later. 313 | 'przazdziecka' -> 1 314 | 'disclosure' -> 1 315 | 'galvanism' -> 1 316 | 'repertoire' -> 1 317 | 'bravado' -> 1 318 | 'gal' -> 1 319 | 'ideological' -> 1 320 | 'guaiacol' -> 1 321 | 'expands' -> 1 322 | 'revolvers' -> 1 323 | 324 | Yeah Chris? Oh, 'what does *lez freekend*' mean? Um, so it means something like: *you probably won't hear or read that word very often.* 325 | 326 | Now what if I asked you to help me find this word I'm looking for? And I know this word starts with the letters: 'th'. 327 | 328 | I'm pretty sure you guys can do this much faster! 329 | 330 | ... 331 | 332 | *5 minutes later!* 333 | 334 | ... 335 | 336 | Not bad! You only had to go through 29157 unique words after all! 337 | 338 | 339 | 'the' -> 80030 340 | 'that' -> 12512 341 | 'this' -> 4063 342 | 'they' -> 3938 343 | 'there'-> 2972 344 | 'their'-> 2955 345 | 'them' -> 2241 346 | 'then' -> 1558 347 | 'these'-> 1231 348 | 'than' -> 1206 349 | ... 229 words more... 350 | 351 | 352 | 239 words, still kind of lot though huh? And you know your big brother, he's too lazy to do this work *by hand* (*cough* program it up *cough*) ;) 353 | 354 | So the word I'm looking for is on the tip of my tongue. I think the next letter is "i". 355 | 356 | *1 minute later* 357 | 358 | 'this' -> 4063 359 | 'think' -> 557 360 | 'things' -> 321 361 | 'thing' -> 303 362 | 'third' -> 239 363 | 'thin' -> 166 364 | 'thinking' -> 137 365 | 'thirty' -> 123 366 | 'thick' -> 77 367 | 'thirds' -> 43 368 | ... 36 words more... 369 | 370 | 371 | *I scan through the first 10 words.* Oh, I just remembered that the next letter is 'r'. 
372 | 373 | *You start taking out even more words.* 374 | 375 | *10 seconds later.* 376 | 377 | 'third' -> 239 378 | 'thirty' -> 123 379 | 'thirds' -> 43 380 | 'thirteen' -> 32 381 | 'thirst' -> 13 382 | 'thirteenth' -> 11 383 | 'thirdly' -> 8 384 | 'thirsty' -> 5 385 | 'thirtieth' -> 3 386 | 'thirties' -> 2 387 | 388 | Aha, 'thirdly' was the word I was looking for! What, you never heard of the word "thirdly" before? 389 | 390 | Now you might be saying to yourself, "*that's pretty cool!*", and you're right! 391 | 392 | And you know what's cooler? *Making everyone's life a tiny bit easier* is! :) 393 | 394 | But how can you do that with just *words*? 395 | 396 | Aren't words boring and dull? 397 | 398 | It's like all we do is talk, write, and think with *words*. I mean, how lame, I can't even describe to you this *autocomplete* thing-slash-idea-thing without having to write it out with *words*! 399 | 400 | Ugh! I hate words! 401 | 402 | *Whoah, wait a minute! That was not cool of me! Let's relax for a minute.* 403 | 404 | Let's try to give an imaginary hug to the word-factory in our brains. That part of our brain works so hard, even when we don't ask it to. How nice of our brain to do that. Not! 405 | 406 | What I'm trying to say is that sometimes it's not very nice for our brains to distract us, especially when we have homework or other, real-world problems like adult-homework. 407 | 408 | ... 409 | 410 | So how about this: 411 | 412 | As a mental exercise, let's just try to think about *what* the next sentence coming out of our own mouths *will be*[\*](#note-2). 413 | 414 | Now if you're thinking about what will be coming out of my mouth, or out of your mouth, or your mouth, or your mouth, or your mouth, you're doing it wrong! (to readers who aren't one of my 4 younger siblings, that's how many I have). 415 | 416 | Try your best to think about *what* the next sentence coming out of *your own* mouth will be. 417 | 418 | ... 419 | 420 | Did you decide on your sentence? Good! 421 | 422 | Now what if I asked you to give me two **good** reasons explaining *why* and *how* you chose the sentence you chose? 423 | 424 | Wait, I can't even do that! Let's make it easier on ourselves. Let's try to only answer *why* and *how* we chose just the first word. 425 | 426 | Still pretty hard huh? 427 | 428 | If you thought it was pretty darn hard to give a *good and honest* reason as to why it is you chose the word you chose, it's alright. :) 429 | 430 | But like all couch-scientists, let's just make a guess! My guess is: our brain is a **probabilistic machine**. 431 | 432 | If you feel like you don't *get* what the word "probabilisitic" or "probability" means, sure you do! Just use the word "probably" in one of your sentences, but try to make some sense. 433 | 434 | Ok, so what do I mean? Well, let's just consider the English language. Like most other things, the English language has rules. 435 | 436 | The kind of rules that can be simplified down to: 437 | 438 | 1) "***something*** *action* ***something***". 439 | 440 | 2) Replace ***something***'s and ***action*** with words that make sense to you. 441 | 442 | Fair enough, right? 443 | 444 | Now imagine that your brain essentially has those rules "branded" or "recorded" into itself. Ok, so now I'm starting to not make much sense huh? 445 | 446 | How about this? How many times have you heard, 447 | 448 | "**Do** your **bed**!" 449 | 450 | "**Brush** your **teeth**!" 451 | 452 | "**Let's** get **food**!" 
453 | 454 | While each one of you guys may have not heard those *exact* sentences, what I'm trying to say makes sense right? *That you probably heard certain sentences more often than others?* 455 | 456 | ... 457 | 458 | Now, imagine you could put "pause" right after the first word that comes out of your mouth. 459 | 460 | Let's just say that first word is "the". 461 | 462 | Now in the case that you stuttered for reasons outside your conscientious control (for example: "thhh thhe the"). No big deal, you meant to say "the", so let's *flatten* it to just that! 463 | 464 | With that *word* said, what words do you *think* you might have said after it? 465 | 466 | You might tell me, "*any word I want!* 467 | 468 | Of course you could have! I bet you spent a millisecond thinking about whether or not the next word you were going to say was going to be: *guaiacol*. 469 | 470 | I *know* because I thought about using that word too! 471 | 472 | I can remember the first time I heard (or read) *guaiacol* like it was yesterday. I read it in some funky article on the internet. I found the word in a list of words that don't appear too often in the English language. 473 | 474 | After I read it, I was able to fit *guaiacol* nicely into that part of my brain where I... uhh.. was... able... uhh... 475 | 476 | Oh, you *know*, that place in my brain where I get to choose whether I want to say *the apple*, *the automobile*, *the austronaut*, etc. 477 | 478 | ... 479 | 480 | Ok, so clearly I'm no brainician, and that may or may not be the way our brain works. 481 | 482 | But even though that idea might be wrong, the idea itself sounds like a pretty darn good way of suggesting the next word or words somebody is trying to *type*. 483 | 484 | What if you had a way to count the number of times you've heard "apple" said after the word "the"? 485 | 486 | Ask yourself the same question, but now with the word "automobile" instead of "apple". 487 | 488 | What if you had the time to think about every possible word that you've ever heard spoken after the word "the"? I'd say it might have looked something like this: 489 | 490 | Words you might have heard following the word "the" and the number of times you might have heard it 491 | 492 | 'same' -> 996 493 | 'french' -> 688 494 | 'first' -> 652 495 | 'old' -> 591 496 | 'emperor' -> 581 497 | 'other' -> 528 498 | 'whole' -> 500 499 | 'united' -> 466 500 | 'room' -> 376 501 | 'most' -> 373 502 | 503 | ... 9331 more words... 504 | 505 | Not impressed with your brain yet? Let's continue this little thought experiment further. 506 | 507 | Imagine that you just said "the", and you could put pause after the first *letter* of the next word out of your mouth: "h". 508 | 509 | Real quick, think of the shortest amount of time you can think of. Think of the shortest *second* you can think of. Now shorter than that too. 510 | 511 | At this point, you can't even call that length of time a *second*. But in that length of time, your brain may have just done this: 512 | 513 | Every word you've ever heard coming after the word "the": 514 | 515 | 'house' -> 284 516 | 'head' -> 117 517 | 'hands' -> 101 518 | 'hand' -> 97 519 | 'horses' -> 71 520 | 'hill' -> 64 521 | 'highest' -> 64 522 | 'high' -> 57 523 | 'history' -> 56 524 | 'heart' -> 55 525 | 526 | And that brain you got did this realllllyyyyyy fast. Faster than Google, Bing, Yahoo and any other company can ever hope to beat. And your brain did this without even asking for your permission. 
I think our brains are trying to control us you guys, oh no!
527 | 
528 | ...
529 | 
530 | Thanks for reading this far, folks. Please go to the [afterword](https://github.com/rodricios/autocomplete#afterword) for some of the resources I've found useful in both building the intuition and writing this article.
531 | 
532 | Also, if it's not too much to ask, consider following me or tweeting this to your friends and/or family; any support is appreciated :)
533 | 
534 | - [@rodricios](https://twitter.com/rodricios)
535 | 
536 | ## If you're not 5
537 | 
538 | The basic idea is this:
539 | 
540 | Assume you have a large collection of English-understandable text merged into a single string.
541 | 
542 | Start by transforming that string into a list of words (AKA *word-length ngrams*), and also (though it's not required) normalize each word ('The' -> 'the').
543 | 
544 | Once you have a normalized list of words, you can start building a frequency distribution measuring the frequency of each word.
545 | 
546 | ...
547 | 
548 | At this point you can start to "predict" the "final state" of a word-in-progress. But consider the case where a user types in some query box:
549 | 
550 |     "The th"
551 | 
552 | And he intends to write:
553 | 
554 |     "The third"
555 | 
556 | With the above predictive model, you'll be suggesting something like:
557 | 
558 |     [
559 |     ('the', 80030),
560 |     ('they', 3938),
561 |     ('there', 2972),
562 |     ...
563 |     ]
564 | 
565 | This explains one specific type of predictive model, which can be written as P(word), and you've just seen the pitfalls of using **just** this model.
566 | 
567 | Now for the next word, ask yourself: what's the probability that I'm going to type the word "apple" given that I wrote "tasty"?
568 | 
569 | In machine learning and AI books, you'll be presented with *Conditional Probability* via the following equation:
570 | 
571 |     P(word A and word B) = P(word B | word A) * P(word A)
572 | 
573 | That equation addresses the problem that I mentioned.
574 | 
575 | We've handled P(word A) already.
576 | 
577 | To handle P(word B | word A), which reads *probability of word B given word A*, I take a *literal* interpretation of the word "given", in that context, to mean the following:
578 | 
579 | *"word A" is the key pointing to a probability distribution representing all the words that follow "word A"*
580 | 
581 | Once we can represent this second model, we can also apply the *filtering* step - given that we know more letters in the second word, we can zone in on more precise suggestions.
582 | 
583 | ---
584 | 
585 | ### Afterword
586 | 
587 | notes: \*I have to give a shout out to [Sam Harris](https://twitter.com/SamHarrisOrg) for being, AFAIK, the first person (or one of the first) to [wonderfully put into words](https://www.youtube.com/watch?v=pCofmZlC72g#t=1144) what I've borrowed and slightly adapted for this writing. [I highly recommend his work](http://www.samharris.org/)
588 | 
589 | Another shoutout to [Peter Norvig](http://norvig.com) for inspiring me and probably many others with our own little "toy" programs. His *Occam's Razor* approach to problem solving will likely cause some confusion, as it may appear that my work is an almost full-on copy-paste of his [*How to Write a Spelling Corrector*](http://norvig.com/spell-correct.html)!
590 | 
591 | But I swear it's not! I actually think I may have out-Norvig'ed Peter Norvig when it comes to describing [conditional probability](http://en.wikipedia.org/wiki/Conditional_probability): P(wordA & wordB) = P(wordB | wordA)\*P(wordA)
592 | 
593 | And another one to Rob Renaud's [Gibberish Detector](https://github.com/rrenaud/Gibberish-Detector). I ran into his project, out of pure chance, some time after running into Norvig's article. I can't describe *how much it helped* to intuitively understand what the heavy hitters of "AI" consider to be introductory material; this was greatly needed b/c at the time, I felt overwhelmed by my own desire to really understand this area, and everything else going on.
594 | 
595 | I do have a second article about this exact thing, only expressed differently (the audience is non-programming), and it may or may not be posted soon! ~~Oh and the code too, that is if someone hasn't gotten to translating the above article to code before I can get to uploading the project :P I'm trying to get the kinks out of here and the code so it's simple, duh!~~
596 | 
597 | I dedicate this work to my sisters, Cat, Melissa and Christine, and my favorite brother, Christian :)
598 | 
599 | #### note 1
600 | 
601 | [go back](#explain-like-im-5)
602 | 
603 | *To avoid confusion, I wrote this section in the form of a letter to my younger siblings*
604 | 
605 | #### note 2
606 | 
607 | *I'm borrowing what I consider [one of the most beautiful thought experiments I've ever heard trying to describe one's self](https://www.youtube.com/watch?v=pCofmZlC72g#t=1144). I'm a big fan of Sam Harris's work. Highly recommend!*
608 | 
--------------------------------------------------------------------------------