├── data
│   ├── README.md
│   ├── speaker.csv
│   ├── test_words.csv
│   └── train_words.csv
├── my_recognizer.py
├── asl_test_recognizer.py
├── asl_test_model_selectors.py
├── README.md
├── asl_test.py
├── my_model_selectors.py
├── asl_utils.py
├── asl_data.py
└── asl_recognizer.ipynb
/data/README.md:
--------------------------------------------------------------------------------
1 | ## American Sign Language Data
2 | The data in this directory, contained in `hands_condensed.csv` and `speaker.csv`, has been derived from the [RWTH-BOSTON-104 Database](http://www-i6.informatik.rwth-aachen.de/~dreuw/database-rwth-boston-104.php). The hand positions are pulled directly from the database file [boston104.handpositions.rybach-forster-dreuw-2009-09-25.full.xml](boston104.handpositions.rybach-forster-dreuw-2009-09-25.full.xml).
3 | 
4 | The videos are sentences with translations provided in the database. For purposes of this project, the sentences have been segmented into words based on slow-motion examination of the files. These segments are provided in the `test_words.csv` and `train_words.csv` files in the form of start and end frames (inclusive). The training and test word files have been divided as they are in the database, which is based on the sentence train/test divisions. The hand positions file has not been divided and contains all frame information.
5 | 
--------------------------------------------------------------------------------
/my_recognizer.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from asl_data import SinglesData
3 | 
4 | 
5 | def recognize(models: dict, test_set: SinglesData):
6 |     """ Recognize test word sequences from the word model set
7 | 
8 |     :param models: dict of trained models
9 |         {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
10 |     :param test_set: SinglesData object
11 |     :return: (list, list) as probabilities, guesses
12 |         both lists are ordered by the test set word_id
13 |         probabilities is a list of dictionaries where each key is a word and each value is its log likelihood
14 |             [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
15 |              {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ... },
16 |             ]
17 |         guesses is a list of the best-guess words ordered by the test set word_id
18 |             ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
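 
        A minimal usage sketch (illustrative only; assumes models were trained,
        e.g. with asl_utils.train_all_words, and test_set was built with
        AslDb.build_test):
            probabilities, guesses = recognize(models, test_set)
            best = max(probabilities[0], key=probabilities[0].get)  # equals guesses[0]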
19 | """ 20 | warnings.filterwarnings("ignore", category=DeprecationWarning) 21 | probabilities = [] 22 | guesses = [] 23 | # TODO implement the recognizer 24 | # return probabilities, guesses 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /asl_test_recognizer.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from asl_data import AslDb 4 | from asl_utils import train_all_words 5 | from my_model_selectors import SelectorConstant 6 | from my_recognizer import recognize 7 | 8 | FEATURES = ['right-y', 'right-x'] 9 | 10 | class TestRecognize(TestCase): 11 | def setUp(self): 12 | self.asl = AslDb() 13 | self.training_set = self.asl.build_training(FEATURES) 14 | self.test_set = self.asl.build_test(FEATURES) 15 | self.models = train_all_words(self.training_set, SelectorConstant) 16 | 17 | def test_recognize_probabilities_interface(self): 18 | probs, _ = recognize(self.models, self.test_set) 19 | self.assertEqual(len(probs), self.test_set.num_items, "Number of test items in probabilities list incorrect.") 20 | self.assertEqual(len(probs[0]), self.training_set.num_items, 21 | "Number of training word probabilities in test item dictionary incorrect.") 22 | self.assertEqual(len(probs[-1]), self.training_set.num_items, 23 | "Number of training word probabilities in test item dictionary incorrect.") 24 | self.assertIn('FRANK', probs[0], "Dictionary of probabilities does not contain correct keys") 25 | self.assertIn('CHICKEN', probs[-1], "Dictionary of probabilities does not contain correct keys") 26 | 27 | def test_recognize_guesses_interface(self): 28 | _, guesses = recognize(self.models, self.test_set) 29 | self.assertEqual(len(guesses), self.test_set.num_items, "Number of test items in guesses list incorrect.") 30 | self.assertIsInstance(guesses[0], str, "The guesses are not strings") 31 | self.assertIsInstance(guesses[-1], str, "The guesses are not strings") 32 | 33 | -------------------------------------------------------------------------------- /asl_test_model_selectors.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from asl_data import AslDb 4 | from my_model_selectors import ( 5 | SelectorConstant, SelectorBIC, SelectorDIC, SelectorCV, 6 | ) 7 | 8 | FEATURES = ['right-y', 'right-x'] 9 | 10 | class TestSelectors(TestCase): 11 | def setUp(self): 12 | asl = AslDb() 13 | self.training = asl.build_training(FEATURES) 14 | self.sequences = self.training.get_all_sequences() 15 | self.xlengths = self.training.get_all_Xlengths() 16 | 17 | def test_select_constant_interface(self): 18 | model = SelectorConstant(self.sequences, self.xlengths, 'BUY').select() 19 | self.assertGreaterEqual(model.n_components, 2) 20 | model = SelectorConstant(self.sequences, self.xlengths, 'BOOK').select() 21 | self.assertGreaterEqual(model.n_components, 2) 22 | 23 | def test_select_bic_interface(self): 24 | model = SelectorBIC(self.sequences, self.xlengths, 'FRANK').select() 25 | self.assertGreaterEqual(model.n_components, 2) 26 | model = SelectorBIC(self.sequences, self.xlengths, 'VEGETABLE').select() 27 | self.assertGreaterEqual(model.n_components, 2) 28 | 29 | def test_select_cv_interface(self): 30 | model = SelectorCV(self.sequences, self.xlengths, 'JOHN').select() 31 | self.assertGreaterEqual(model.n_components, 2) 32 | model = SelectorCV(self.sequences, self.xlengths, 'CHICKEN').select() 33 | 
self.assertGreaterEqual(model.n_components, 2) 34 | 35 | def test_select_dic_interface(self): 36 | model = SelectorDIC(self.sequences, self.xlengths, 'MARY').select() 37 | self.assertGreaterEqual(model.n_components, 2) 38 | model = SelectorDIC(self.sequences, self.xlengths, 'TOY').select() 39 | self.assertGreaterEqual(model.n_components, 2) 40 | -------------------------------------------------------------------------------- /data/speaker.csv: -------------------------------------------------------------------------------- 1 | video,speaker 2 | 1,woman-1 3 | 3,woman-2 4 | 4,woman-1 5 | 5,woman-2 6 | 6,woman-2 7 | 8,man-1 8 | 9,woman-2 9 | 10,woman-1 10 | 11,woman-2 11 | 13,woman-2 12 | 14,woman-2 13 | 15,woman-2 14 | 16,man-1 15 | 17,woman-2 16 | 18,woman-1 17 | 19,woman-2 18 | 20,woman-2 19 | 22,woman-2 20 | 23,woman-2 21 | 24,woman-2 22 | 26,woman-2 23 | 27,woman-2 24 | 29,man-1 25 | 31,man-1 26 | 32,woman-2 27 | 33,woman-2 28 | 34,woman-2 29 | 35,man-1 30 | 37,woman-2 31 | 38,woman-2 32 | 39,man-1 33 | 41,woman-2 34 | 42,woman-1 35 | 44,woman-2 36 | 45,woman-1 37 | 46,woman-1 38 | 47,woman-2 39 | 48,woman-1 40 | 49,woman-1 41 | 51,woman-1 42 | 52,woman-1 43 | 53,woman-2 44 | 55,woman-2 45 | 56,man-1 46 | 58,man-1 47 | 59,woman-1 48 | 60,woman-1 49 | 61,woman-1 50 | 62,woman-1 51 | 63,woman-1 52 | 64,woman-1 53 | 65,woman-1 54 | 66,woman-1 55 | 68,woman-2 56 | 69,woman-2 57 | 70,woman-2 58 | 72,woman-2 59 | 73,man-1 60 | 75,woman-2 61 | 76,woman-2 62 | 78,woman-2 63 | 79,woman-2 64 | 80,woman-2 65 | 81,woman-2 66 | 82,woman-1 67 | 83,woman-1 68 | 85,woman-1 69 | 86,woman-1 70 | 87,woman-2 71 | 88,woman-2 72 | 91,woman-2 73 | 93,man-1 74 | 94,woman-2 75 | 95,man-1 76 | 96,man-1 77 | 97,woman-2 78 | 98,woman-1 79 | 99,woman-1 80 | 101,woman-1 81 | 102,woman-2 82 | 103,woman-1 83 | 104,woman-1 84 | 106,man-1 85 | 109,woman-2 86 | 110,woman-2 87 | 111,woman-2 88 | 112,man-1 89 | 114,man-1 90 | 115,woman-2 91 | 116,man-1 92 | 117,man-1 93 | 118,man-1 94 | 120,man-1 95 | 121,woman-2 96 | 123,woman-1 97 | 124,man-1 98 | 125,woman-1 99 | 126,woman-1 100 | 127,man-1 101 | 128,woman-1 102 | 129,man-1 103 | 130,man-1 104 | 131,man-1 105 | 132,man-1 106 | 133,man-1 107 | 134,woman-1 108 | 135,woman-1 109 | 136,woman-1 110 | 137,man-1 111 | 138,woman-1 112 | 140,woman-1 113 | 141,woman-1 114 | 143,man-1 115 | 144,woman-1 116 | 145,woman-1 117 | 146,woman-1 118 | 147,woman-1 119 | 148,man-1 120 | 149,man-1 121 | 150,woman-1 122 | 151,woman-1 123 | 152,man-1 124 | 153,man-1 125 | 154,woman-1 126 | 155,man-1 127 | 156,woman-1 128 | 157,man-1 129 | 159,man-1 130 | 160,woman-1 131 | 161,woman-1 132 | 162,woman-1 133 | 163,woman-1 134 | 164,woman-1 135 | 165,man-1 136 | 166,man-1 137 | 168,man-1 138 | 169,man-1 139 | 170,man-1 140 | 172,man-1 141 | 173,man-1 142 | 175,man-1 143 | 176,man-1 144 | 177,man-1 145 | 178,man-1 146 | 179,man-1 147 | 180,man-1 148 | 182,man-1 149 | 183,man-1 150 | 185,man-1 151 | 186,man-1 152 | 187,man-1 153 | 188,man-1 154 | 190,man-1 155 | 191,man-1 156 | 192,man-1 157 | 194,woman-1 158 | 195,woman-2 159 | 196,man-1 160 | 197,man-1 161 | 198,man-1 162 | 200,woman-1 163 | 2,woman-1 164 | 7,man-1 165 | 12,woman-2 166 | 21,woman-2 167 | 25,woman-2 168 | 28,woman-2 169 | 30,man-1 170 | 36,man-1 171 | 40,man-1 172 | 43,woman-1 173 | 50,woman-1 174 | 54,woman-2 175 | 57,man-1 176 | 67,woman-1 177 | 71,woman-2 178 | 74,man-1 179 | 77,woman-2 180 | 84,woman-1 181 | 89,woman-2 182 | 90,man-1 183 | 92,woman-2 184 | 100,woman-1 185 | 105,woman-1 186 | 107,man-1 187 | 108,woman-2 188 | 
113,man-1
189 | 119,man-1
190 | 122,woman-2
191 | 139,man-1
192 | 142,woman-1
193 | 158,man-1
194 | 167,man-1
195 | 171,man-1
196 | 174,man-1
197 | 181,man-1
198 | 184,man-1
199 | 189,man-1
200 | 193,man-1
201 | 199,woman-1
202 | 201,woman-2
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Artificial Intelligence Engineer Nanodegree
2 | ## Probabilistic Models
3 | ## Project: Sign Language Recognition System
4 | 
5 | ### Install
6 | 
7 | This project requires **Python 3** and the following Python libraries installed:
8 | 
9 | - [NumPy](http://www.numpy.org/)
10 | - [SciPy](https://www.scipy.org/)
11 | - [scikit-learn](http://scikit-learn.org/0.17/install.html)
12 | - [pandas](http://pandas.pydata.org/)
13 | - [matplotlib](http://matplotlib.org/)
14 | - [jupyter](http://ipython.org/notebook.html)
15 | - [hmmlearn](http://hmmlearn.readthedocs.io/en/latest/)
16 | 
17 | Notes:
18 | 1. It is highly recommended that you install the [Anaconda](http://continuum.io/downloads) distribution of Python and load the environment included in the "Your conda env for AI ND" lesson.
19 | 2. The most recent development version of hmmlearn, 0.2.1, contains a bugfix related to the log function, which is used in this project. To install this version of hmmlearn, install it directly from its repo with the following command from within your activated Anaconda environment:
20 | ```sh
21 | pip install git+https://github.com/hmmlearn/hmmlearn.git
22 | ```
23 | 
24 | ### Code
25 | 
26 | A template notebook is provided as `asl_recognizer.ipynb`. The notebook is a combination tutorial and submission document. Some of the codebase and some of your implementation will be external to the notebook. For submission, complete the **Submission** sections of each part. This includes running your implementations in notebook code cells, answering the analysis questions, and passing the unit tests provided in the codebase and called out in the notebook.
27 | 
28 | ### Run
29 | 
30 | In a terminal or command window, navigate to the top-level project directory `AIND_recognizer/` (the one that contains this README) and run the following command:
31 | 
32 | `jupyter notebook asl_recognizer.ipynb`
33 | 
34 | This will open the Jupyter Notebook software and the notebook in your browser. Follow the instructions in the notebook to complete the project.
35 | 
36 | 
37 | ### Additional Information
38 | ##### Provided Raw Data
39 | 
40 | The data in the `asl_recognizer/data/` directory was derived from
41 | the [RWTH-BOSTON-104 Database](http://www-i6.informatik.rwth-aachen.de/~dreuw/database-rwth-boston-104.php).
42 | The hand positions (`hands_condensed.csv`) are pulled directly from
43 | the database file [boston104.handpositions.rybach-forster-dreuw-2009-09-25.full.xml](boston104.handpositions.rybach-forster-dreuw-2009-09-25.full.xml). The three markers are:
44 | 
45 | * 0  speaker's left hand
46 | * 1  speaker's right hand
47 | * 2  speaker's nose
48 | 
X and Y values of the video frame increase left to right and top to bottom.
49 | 
50 | Take a look at the sample [ASL recognizer video](http://www-i6.informatik.rwth-aachen.de/~dreuw/download/021.avi)
51 | to see how the hand locations are tracked.
52 | 
53 | The videos are sentences with translations provided in the database.
54 | For purposes of this project, the sentences have been pre-segmented into words
55 | based on slow motion examination of the files. 
56 | These segments are provided in the `train_words.csv` and `test_words.csv` files 57 | in the form of start and end frames (inclusive). 58 | 59 | The videos in the corpus include recordings from three different ASL speakers. 60 | The mappings for the three speakers to video are included in the `speaker.csv` 61 | file. 62 | -------------------------------------------------------------------------------- /asl_test.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from asl_data import AslDb 4 | from asl_utils import train_all_words 5 | from my_model_selectors import ( 6 | SelectorConstant, SelectorBIC, SelectorDIC, SelectorCV, 7 | ) 8 | from my_recognizer import recognize 9 | 10 | """ DEPRECATED MODULE 11 | This module has been split into two new modules: asl_test_model_selectors.py and asl_test_recognizer.py 12 | This module is included in the repo for the sake of legacy code that still uses it. 13 | """ 14 | FEATURES = ['right-y', 'right-x'] 15 | 16 | 17 | class TestSelectors(TestCase): 18 | def setUp(self): 19 | asl = AslDb() 20 | self.training = asl.build_training(FEATURES) 21 | self.sequences = self.training.get_all_sequences() 22 | self.xlengths = self.training.get_all_Xlengths() 23 | 24 | def test_select_constant_interface(self): 25 | model = SelectorConstant(self.sequences, self.xlengths, 'BUY').select() 26 | self.assertGreaterEqual(model.n_components, 2) 27 | model = SelectorConstant(self.sequences, self.xlengths, 'BOOK').select() 28 | self.assertGreaterEqual(model.n_components, 2) 29 | 30 | def test_select_bic_interface(self): 31 | model = SelectorBIC(self.sequences, self.xlengths, 'FRANK').select() 32 | self.assertGreaterEqual(model.n_components, 2) 33 | model = SelectorBIC(self.sequences, self.xlengths, 'VEGETABLE').select() 34 | self.assertGreaterEqual(model.n_components, 2) 35 | 36 | def test_select_cv_interface(self): 37 | model = SelectorCV(self.sequences, self.xlengths, 'JOHN').select() 38 | self.assertGreaterEqual(model.n_components, 2) 39 | model = SelectorCV(self.sequences, self.xlengths, 'CHICKEN').select() 40 | self.assertGreaterEqual(model.n_components, 2) 41 | 42 | def test_select_dic_interface(self): 43 | model = SelectorDIC(self.sequences, self.xlengths, 'MARY').select() 44 | self.assertGreaterEqual(model.n_components, 2) 45 | model = SelectorDIC(self.sequences, self.xlengths, 'TOY').select() 46 | self.assertGreaterEqual(model.n_components, 2) 47 | 48 | 49 | class TestRecognize(TestCase): 50 | def setUp(self): 51 | self.asl = AslDb() 52 | self.training_set = self.asl.build_training(FEATURES) 53 | self.test_set = self.asl.build_test(FEATURES) 54 | self.models = train_all_words(self.training_set, SelectorConstant) 55 | 56 | def test_recognize_probabilities_interface(self): 57 | probs, _ = recognize(self.models, self.test_set) 58 | self.assertEqual(len(probs), self.test_set.num_items, "Number of test items in probabilities list incorrect.") 59 | self.assertEqual(len(probs[0]), self.training_set.num_items, 60 | "Number of training word probabilities in test item dictionary incorrect.") 61 | self.assertEqual(len(probs[-1]), self.training_set.num_items, 62 | "Number of training word probabilities in test item dictionary incorrect.") 63 | self.assertIn('FRANK', probs[0], "Dictionary of probabilities does not contain correct keys") 64 | self.assertIn('CHICKEN', probs[-1], "Dictionary of probabilities does not contain correct keys") 65 | 66 | def test_recognize_guesses_interface(self): 67 | _, 
guesses = recognize(self.models, self.test_set)
68 |         self.assertEqual(len(guesses), self.test_set.num_items, "Number of test items in guesses list incorrect.")
69 |         self.assertIsInstance(guesses[0], str, "The guesses are not strings")
70 |         self.assertIsInstance(guesses[-1], str, "The guesses are not strings")
71 | 
--------------------------------------------------------------------------------
/my_model_selectors.py:
--------------------------------------------------------------------------------
1 | import math
2 | import statistics
3 | import warnings
4 | 
5 | import numpy as np
6 | from hmmlearn.hmm import GaussianHMM
7 | from sklearn.model_selection import KFold
8 | from asl_utils import combine_sequences
9 | 
10 | 
11 | class ModelSelector(object):
12 |     '''
13 |     base class for model selection (strategy design pattern)
14 |     '''
15 | 
16 |     def __init__(self, all_word_sequences: dict, all_word_Xlengths: dict, this_word: str,
17 |                  n_constant=3,
18 |                  min_n_components=2, max_n_components=10,
19 |                  random_state=14, verbose=False):
20 |         self.words = all_word_sequences
21 |         self.hwords = all_word_Xlengths
22 |         self.sequences = all_word_sequences[this_word]
23 |         self.X, self.lengths = all_word_Xlengths[this_word]
24 |         self.this_word = this_word
25 |         self.n_constant = n_constant
26 |         self.min_n_components = min_n_components
27 |         self.max_n_components = max_n_components
28 |         self.random_state = random_state
29 |         self.verbose = verbose
30 | 
31 |     def select(self):
32 |         raise NotImplementedError
33 | 
34 |     def base_model(self, num_states):
35 |         # with warnings.catch_warnings():
36 |         warnings.filterwarnings("ignore", category=DeprecationWarning)
37 |         # warnings.filterwarnings("ignore", category=RuntimeWarning)
38 |         try:
39 |             hmm_model = GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000,
40 |                                     random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
41 |             if self.verbose:
42 |                 print("model created for {} with {} states".format(self.this_word, num_states))
43 |             return hmm_model
44 |         except Exception:
45 |             if self.verbose:
46 |                 print("failure on {} with {} states".format(self.this_word, num_states))
47 |             return None
48 | 
49 | 
50 | class SelectorConstant(ModelSelector):
51 |     """ select the model with value self.n_constant
52 | 
53 |     """
54 | 
55 |     def select(self):
56 |         """ select based on n_constant value
57 | 
58 |         :return: GaussianHMM object
59 |         """
60 |         best_num_components = self.n_constant
61 |         return self.base_model(best_num_components)
62 | 
63 | 
64 | class SelectorBIC(ModelSelector):
65 |     """ select the model with the lowest Bayesian Information Criterion (BIC) score
66 | 
67 |     http://www2.imm.dtu.dk/courses/02433/doc/ch6_slides.pdf
68 |     Bayesian information criterion: BIC = -2 * logL + p * logN
69 |     """
70 | 
71 |     def select(self):
72 |         """ select the best model for self.this_word based on
73 |         BIC score for n between self.min_n_components and self.max_n_components
74 | 
75 |         :return: GaussianHMM object
76 |         """
77 |         warnings.filterwarnings("ignore", category=DeprecationWarning)
78 | 
79 |         # TODO implement model selection based on BIC scores
80 |         raise NotImplementedError
81 | 
82 | 
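# --- Added illustrative example (a hedged sketch, not the required SelectorBIC
# solution). For a fitted GaussianHMM with n components, d features, and "diag"
# covariances, a common free-parameter count is
#     p = n*(n-1) + (n-1) + 2*n*d  =  n**2 + 2*n*d - 1
# (transition rows, initial state distribution, means, and variances), giving
#     BIC = -2 * logL + p * log(N)
def example_bic_score(model, X, lengths):
    """ Illustrative only: BIC for a fitted GaussianHMM; X is a numpy array. """
    logL = model.score(X, lengths)
    n, d = model.n_components, X.shape[1]
    p = n ** 2 + 2 * n * d - 1
    return -2.0 * logL + p * math.log(len(X))
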
83 | class SelectorDIC(ModelSelector):
84 |     ''' select best model based on Discriminative Information Criterion
85 | 
86 |     Biem, Alain. "A model selection criterion for classification: Application to hmm topology optimization."
87 |     Document Analysis and Recognition, 2003. Proceedings. Seventh International Conference on. IEEE, 2003.
88 |     http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.58.6208&rep=rep1&type=pdf
89 |     DIC = log(P(X(i))) - 1/(M-1) * SUM(log(P(X(all but i))))
90 |     '''
91 | 
92 |     def select(self):
93 |         warnings.filterwarnings("ignore", category=DeprecationWarning)
94 | 
95 |         # TODO implement model selection based on DIC scores
96 |         raise NotImplementedError
97 | 
98 | 
99 | class SelectorCV(ModelSelector):
100 |     ''' select best model based on average log likelihood of cross-validation folds
101 | 
102 |     '''
103 | 
104 |     def select(self):
105 |         warnings.filterwarnings("ignore", category=DeprecationWarning)
106 | 
107 |         # TODO implement model selection using CV
108 |         raise NotImplementedError
109 | 
--------------------------------------------------------------------------------
/asl_utils.py:
--------------------------------------------------------------------------------
1 | from asl_data import SinglesData, WordsData
2 | import numpy as np
3 | from IPython.core.display import display, HTML
4 | 
5 | RAW_FEATURES = ['left-x', 'left-y', 'right-x', 'right-y']
6 | GROUND_FEATURES = ['grnd-rx', 'grnd-ry', 'grnd-lx', 'grnd-ly']
7 | 
8 | 
9 | def show_errors(guesses: list, test_set: SinglesData):
10 |     """ Print WER and sentence differences in tabular form
11 | 
12 |     :param guesses: list of test item answers, ordered
13 |     :param test_set: SinglesData object
14 |     :return:
15 |         nothing returned, prints error report
16 | 
17 |     WER = (S+I+D)/N, but we have no insertions or deletions for isolated words, so WER = S/N
18 |     """
19 |     S = 0
20 |     N = len(test_set.wordlist)
21 |     num_test_words = len(test_set.wordlist)
22 |     if len(guesses) != num_test_words:
23 |         print("Size of guesses must equal number of test words ({})!".format(num_test_words))
24 |     for word_id in range(num_test_words):
25 |         if guesses[word_id] != test_set.wordlist[word_id]:
26 |             S += 1
27 | 
28 |     print("\n**** WER = {}".format(float(S) / float(N)))
29 |     print("Total correct: {} out of {}".format(N - S, N))
30 |     print('Video  Recognized                                                    Correct')
31 |     print('=====================================================================================================')
32 |     for video_num in test_set.sentences_index:
33 |         correct_sentence = [test_set.wordlist[i] for i in test_set.sentences_index[video_num]]
34 |         recognized_sentence = [guesses[i] for i in test_set.sentences_index[video_num]]
35 |         for i in range(len(recognized_sentence)):
36 |             if recognized_sentence[i] != correct_sentence[i]:
37 |                 recognized_sentence[i] = '*' + recognized_sentence[i]
38 |         print('{:5}: {:60} {}'.format(video_num, ' '.join(recognized_sentence), ' '.join(correct_sentence)))
39 | 
40 | 
41 | def getKey(item):
42 |     return item[1]
43 | 
44 | 
45 | def train_all_words(training: WordsData, model_selector):
46 |     """ train all words given a training set and selector
47 | 
48 |     :param training: WordsData object (training set)
49 |     :param model_selector: class (subclassed from ModelSelector)
50 |     :return: dict of models keyed by word
51 |     """
52 |     sequences = training.get_all_sequences()
53 |     Xlengths = training.get_all_Xlengths()
54 |     model_dict = {}
55 |     for word in training.words:
56 |         model = model_selector(sequences, Xlengths, word,
57 |                                n_constant=3).select()
58 |         model_dict[word] = model
59 |     return model_dict
60 | 
61 | 
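# --- Added illustrative example (a hedged sketch, not part of the original module) ---
# Typical end-to-end usage of train_all_words with the provided data files,
# mirroring the unit tests:
#
#     from asl_data import AslDb
#     from my_model_selectors import SelectorConstant
#
#     asl = AslDb()
#     training = asl.build_training(['right-x', 'right-y'])
#     models = train_all_words(training, SelectorConstant)
#     print("Trained {} word models".format(len(models)))
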
62 | def combine_sequences(split_index_list, sequences):
63 |     '''
64 |     concatenates sequences referenced in an index list and returns a tuple of the new X, lengths
65 | 
66 |     useful when recombining sequences split using KFold for hmmlearn
67 | 
68 |     :param split_index_list: a list of indices as created by KFold splitting
69 |     :param sequences: list of feature sequences
70 |     :return: tuple of list, list in the X, lengths format used in hmmlearn
71 |     '''
72 |     sequences_fold = [sequences[idx] for idx in split_index_list]
73 |     X = [item for sublist in sequences_fold for item in sublist]
74 |     lengths = [len(sublist) for sublist in sequences_fold]
75 |     return X, lengths
76 | 
77 | 
78 | def putHTML(color, msg):
79 |     source = """<font color="{}">{}</font>
""".format(color, msg) 80 | return HTML(source) 81 | 82 | 83 | def feedback(passed, failmsg='', passmsg='Correct!'): 84 | if passed: 85 | return putHTML('green', passmsg) 86 | else: 87 | return putHTML('red', failmsg) 88 | 89 | 90 | def test_features_tryit(asl): 91 | print('asl.df sample') 92 | display(asl.df.head()) 93 | sample = asl.df.ix[98, 1][GROUND_FEATURES].tolist() 94 | correct = [9, 113, -12, 119] 95 | failmsg = 'The values returned were not correct. Expected: {} Found: {}'.format(correct, sample) 96 | return feedback(sample == correct, failmsg) 97 | 98 | 99 | def test_std_tryit(df_std): 100 | print('df_std') 101 | display(df_std) 102 | sample = df_std.ix['man-1'][RAW_FEATURES] 103 | correct = [15.154425, 36.328485, 18.901917, 54.902340] 104 | failmsg = 'The raw man-1 values returned were not correct.\nExpected: {} for {}'.format(correct, RAW_FEATURES) 105 | return feedback(np.allclose(sample, correct, .001), failmsg) 106 | -------------------------------------------------------------------------------- /data/test_words.csv: -------------------------------------------------------------------------------- 1 | video,speaker,word,startframe,endframe 2 | 2,woman-1,JOHN,7,20 3 | 2,woman-1,WRITE,23,36 4 | 2,woman-1,HOMEWORK,38,63 5 | 7,man-1,JOHN,22,39 6 | 7,man-1,CAN,42,47 7 | 7,man-1,GO,48,56 8 | 7,man-1,CAN,62,73 9 | 12,woman-2,JOHN,9,15 10 | 12,woman-2,CAN,19,24 11 | 12,woman-2,GO,25,34 12 | 12,woman-2,CAN,35,51 13 | 21,woman-2,JOHN,6,26 14 | 21,woman-2,FISH,33,50 15 | 21,woman-2,WONT,53,60 16 | 21,woman-2,EAT,64,74 17 | 21,woman-2,BUT,78,85 18 | 21,woman-2,CAN,85,90 19 | 21,woman-2,EAT,92,97 20 | 21,woman-2,CHICKEN,99,109 21 | 25,woman-2,JOHN,33,41 22 | 25,woman-2,LIKE,43,48 23 | 25,woman-2,IX,49,62 24 | 25,woman-2,IX,62,68 25 | 25,woman-2,IX,68,79 26 | 28,woman-2,JOHN,5,12 27 | 28,woman-2,LIKE,13,19 28 | 28,woman-2,IX,20,30 29 | 28,woman-2,IX,30,38 30 | 28,woman-2,IX,38,49 31 | 30,man-1,JOHN,9,17 32 | 30,man-1,LIKE,19,23 33 | 30,man-1,IX,24,31 34 | 30,man-1,IX,32,38 35 | 30,man-1,IX,39,45 36 | 36,man-1,MARY,13,32 37 | 36,man-1,VEGETABLE,47,66 38 | 36,man-1,KNOW,68,77 39 | 36,man-1,IX,78,83 40 | 36,man-1,LIKE,85,89 41 | 36,man-1,CORN1,92,112 42 | 40,man-1,JOHN,14,37 43 | 40,man-1,IX,37,48 44 | 40,man-1,THINK,51,58 45 | 40,man-1,MARY,61,75 46 | 40,man-1,LOVE,78,87 47 | 43,woman-1,JOHN,3,10 48 | 43,woman-1,MUST,12,19 49 | 43,woman-1,BUY,22,27 50 | 43,woman-1,HOUSE,30,45 51 | 50,woman-1,FUTURE,13,20 52 | 50,woman-1,JOHN,21,26 53 | 50,woman-1,BUY,29,34 54 | 50,woman-1,CAR,35,41 55 | 50,woman-1,SHOULD,42,57 56 | 54,woman-2,JOHN,4,9 57 | 54,woman-2,SHOULD,10,16 58 | 54,woman-2,NOT,19,26 59 | 54,woman-2,BUY,29,33 60 | 54,woman-2,HOUSE,35,46 61 | 57,man-1,JOHN,0,14 62 | 57,man-1,DECIDE,17,29 63 | 57,man-1,VISIT,29,38 64 | 57,man-1,MARY,39,48 65 | 67,woman-1,JOHN,5,9 66 | 67,woman-1,FUTURE,12,30 67 | 67,woman-1,NOT,33,40 68 | 67,woman-1,BUY,43,49 69 | 67,woman-1,HOUSE,50,64 70 | 71,woman-2,JOHN,3,9 71 | 71,woman-2,WILL,9,17 72 | 71,woman-2,VISIT,17,28 73 | 71,woman-2,MARY,28,40 74 | 74,man-1,JOHN,4,15 75 | 74,man-1,NOT,15,23 76 | 74,man-1,VISIT,25,37 77 | 74,man-1,MARY,39,47 78 | 77,woman-2,ANN,6,17 79 | 77,woman-2,BLAME,17,28 80 | 77,woman-2,MARY,28,47 81 | 84,woman-1,IX-1P,3,6 82 | 84,woman-1,FIND,6,17 83 | 84,woman-1,SOMETHING-ONE,17,29 84 | 84,woman-1,BOOK,33,39 85 | 89,woman-2,JOHN,0,11 86 | 89,woman-2,IX,12,24 87 | 89,woman-2,GIVE,25,38 88 | 89,woman-2,MAN,38,55 89 | 89,woman-2,IX,55,70 90 | 89,woman-2,NEW,82,91 91 | 89,woman-2,COAT,92,103 92 | 90,man-1,JOHN,14,41 93 | 
90,man-1,GIVE,44,58 94 | 90,man-1,IX,71,83 95 | 90,man-1,SOMETHING-ONE,83,97 96 | 90,man-1,WOMAN,97,111 97 | 90,man-1,BOOK,116,128 98 | 92,woman-2,JOHN,8,20 99 | 92,woman-2,GIVE,20,40 100 | 92,woman-2,IX,46,72 101 | 92,woman-2,SOMETHING-ONE,70,82 102 | 92,woman-2,WOMAN,83,92 103 | 92,woman-2,BOOK,94,101 104 | 100,woman-1,POSS,22,30 105 | 100,woman-1,NEW,37,47 106 | 100,woman-1,CAR,47,54 107 | 100,woman-1,BREAK-DOWN,56,67 108 | 105,woman-1,JOHN,21,32 109 | 105,woman-1,LEG,35,42 110 | 107,man-1,JOHN,22,38 111 | 107,man-1,POSS,42,49 112 | 107,man-1,FRIEND,50,63 113 | 107,man-1,HAVE,66,72 114 | 107,man-1,CANDY,73,86 115 | 108,woman-2,WOMAN,29,40 116 | 108,woman-2,ARRIVE,43,53 117 | 113,man-1,IX,14,21 118 | 113,man-1,CAR,23,30 119 | 113,man-1,BLUE,31,40 120 | 113,man-1,SUE,40,50 121 | 113,man-1,BUY,52,65 122 | 119,man-1,SUE,20,33 123 | 119,man-1,BUY,35,42 124 | 119,man-1,IX,42,51 125 | 119,man-1,CAR,52,60 126 | 119,man-1,BLUE,62,69 127 | 122,woman-2,JOHN,6,15 128 | 122,woman-2,READ,17,27 129 | 122,woman-2,BOOK,29,35 130 | 139,man-1,JOHN,14,22 131 | 139,man-1,BUY,25,29 132 | 139,man-1,WHAT,31,38 133 | 139,man-1,YESTERDAY,43,54 134 | 139,man-1,BOOK,62,74 135 | 142,woman-1,JOHN,4,13 136 | 142,woman-1,BUY,17,24 137 | 142,woman-1,YESTERDAY,27,36 138 | 142,woman-1,WHAT,40,53 139 | 142,woman-1,BOOK,60,71 140 | 158,man-1,LOVE,11,18 141 | 158,man-1,JOHN,20,30 142 | 158,man-1,WHO,32,44 143 | 167,man-1,JOHN,10,24 144 | 167,man-1,IX,27,33 145 | 167,man-1,SAY,35,40 146 | 167,man-1,LOVE,40,54 147 | 167,man-1,MARY,57,67 148 | 171,man-1,JOHN,16,28 149 | 171,man-1,MARY,35,45 150 | 171,man-1,BLAME,48,56 151 | 174,man-1,PEOPLE,7,20 152 | 174,man-1,GROUP,22,35 153 | 174,man-1,GIVE1,40,69 154 | 174,man-1,JANA,73,92 155 | 174,man-1,TOY,95,104 156 | 181,man-1,JOHN,16,45 157 | 181,man-1,ARRIVE,49,58 158 | 184,man-1,ALL,13,24 159 | 184,man-1,BOY,26,36 160 | 184,man-1,GIVE,40,68 161 | 184,man-1,TEACHER,72,81 162 | 184,man-1,APPLE,85,96 163 | 189,man-1,JOHN,16,30 164 | 189,man-1,GIVE,31,40 165 | 189,man-1,GIRL,42,50 166 | 189,man-1,BOX,52,64 167 | 193,man-1,JOHN,10,25 168 | 193,man-1,GIVE,27,34 169 | 193,man-1,GIRL,36,42 170 | 193,man-1,BOX,46,60 171 | 199,woman-1,LIKE,15,23 172 | 199,woman-1,CHOCOLATE,24,32 173 | 199,woman-1,WHO,36,60 174 | 201,woman-2,JOHN,6,14 175 | 201,woman-2,TELL,17,21 176 | 201,woman-2,MARY,21,28 177 | 201,woman-2,IX-1P,31,41 178 | 201,woman-2,BUY,41,46 179 | 201,woman-2,HOUSE,48,64 180 | -------------------------------------------------------------------------------- /asl_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | class AslDb(object): 8 | """ American Sign Language database drawn from the RWTH-BOSTON-104 frame positional data 9 | 10 | This class has been designed to provide a convenient interface for individual word data for students in the Udacity AI Nanodegree Program. 
11 | 
12 |     For example, to instantiate and load train/test files using a feature list
13 |     definition named features, the following snippet may be used:
14 |         asl = AslDb()
15 |         asl.build_training(features, tr_file)
16 |         asl.build_test(features, tst_file)
17 | 
18 |     Reference for the original ASL data:
19 |     http://www-i6.informatik.rwth-aachen.de/~dreuw/database-rwth-boston-104.php
20 |     The sentences provided in the data have been segmented into isolated words for this database
21 |     """
22 | 
23 |     def __init__(self,
24 |                  hands_fn=os.path.join('data', 'hands_condensed.csv'),
25 |                  speakers_fn=os.path.join('data', 'speaker.csv'),
26 |                  ):
27 |         """ loads the ASL database from csv files with hand position information by frame, and speaker information
28 | 
29 |         :param hands_fn: str
30 |             filename of hand position csv data with expected format:
31 |                 video,frame,left-x,left-y,right-x,right-y,nose-x,nose-y
32 |         :param speakers_fn: str
33 |             filename of video speaker csv mapping with expected format:
34 |                 video,speaker
35 | 
36 |         Instance variables:
37 |             df: pandas dataframe
38 |                 snippet example:
39 |                              left-x  left-y  right-x  right-y  nose-x  nose-y  speaker
40 |                 video frame
41 |                 98    0         149     181      170      175     161      62  woman-1
42 |                       1         149     181      170      175     161      62  woman-1
43 |                       2         149     181      170      175     161      62  woman-1
44 | 
45 |         """
46 |         self.df = pd.read_csv(hands_fn).merge(pd.read_csv(speakers_fn), on='video')
47 |         self.df.set_index(['video', 'frame'], inplace=True)
48 | 
49 |     def build_training(self, feature_list, csvfilename=os.path.join('data', 'train_words.csv')):
50 |         """ wrapper creates sequence data objects for training words suitable for hmmlearn library
51 | 
52 |         :param feature_list: list of str label names
53 |         :param csvfilename: str
54 |         :return: WordsData object
55 |             dictionary of lists of feature list sequence lists for each word
56 |                 {'FRANK': [[[87, 225], [87, 225], ...], [[88, 219], [88, 219], ...]]}
57 |         """
58 |         return WordsData(self, csvfilename, feature_list)
59 | 
60 |     def build_test(self, feature_list, csvfile=os.path.join('data', 'test_words.csv')):
61 |         """ wrapper creates sequence data objects for individual test word items suitable for hmmlearn library
62 | 
63 |         :param feature_list: list of str label names
64 |         :param csvfile: str
65 |         :return: SinglesData object
66 |             dictionary of lists of feature list sequence lists for each indexed item
67 |                 {3: [[[87, 225], [87, 225], ...]]}
68 |         """
69 |         return SinglesData(self, csvfile, feature_list)
70 | 
71 | 
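# --- Added illustrative example (a hedged sketch, not part of the original interface) ---
# Derived features can be added as columns of asl.df before building a training
# set, as done in the notebook (the 'grnd-*' names match asl_utils.GROUND_FEATURES):
#
#     asl = AslDb()
#     asl.df['grnd-rx'] = asl.df['right-x'] - asl.df['nose-x']
#     asl.df['grnd-ry'] = asl.df['right-y'] - asl.df['nose-y']
#     training = asl.build_training(['grnd-rx', 'grnd-ry'])
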
72 | class WordsData(object):
73 |     """ class provides loading and getters for ASL training data suitable for use with hmmlearn library
74 | 
75 |     """
76 | 
77 |     def __init__(self, asl: AslDb, csvfile: str, feature_list: list):
78 |         """ loads training data sequences suitable for use with hmmlearn library based on the feature_list chosen
79 | 
80 |         :param asl: AslDb object
81 |         :param csvfile: str
82 |             filename of csv file containing word training start and end frame data with expected format:
83 |                 video,speaker,word,startframe,endframe
84 |         :param feature_list: list of str feature labels
85 |         """
86 |         self._data = self._load_data(asl, csvfile, feature_list)
87 |         self._hmm_data = create_hmmlearn_data(self._data)
88 |         self.num_items = len(self._data)
89 |         self.words = list(self._data.keys())
90 | 
91 |     def _load_data(self, asl, fn, feature_list):
92 |         """ Consolidates sequenced feature data into a dictionary of words
93 | 
94 |         :param asl: AslDb object
95 |         :param fn: str
96 |             filename of csv file containing word training data
97 |         :param feature_list: list of str
98 |         :return: dict
99 |         """
100 |         tr_df = pd.read_csv(fn)
101 |         word_dict = {}
102 |         for i in range(len(tr_df)):
103 |             word = tr_df.ix[i, 'word']
104 |             video = tr_df.ix[i, 'video']
105 |             new_sequence = []  # list of sample lists for a sequence
106 |             for frame in range(tr_df.ix[i, 'startframe'], tr_df.ix[i, 'endframe'] + 1):
107 |                 vid_frame = video, frame
108 |                 sample = [asl.df.ix[vid_frame][f] for f in feature_list]
109 |                 if len(sample) > 0:  # don't add if not found
110 |                     new_sequence.append(sample)
111 |             if word in word_dict:
112 |                 word_dict[word].append(new_sequence)  # list of sequences
113 |             else:
114 |                 word_dict[word] = [new_sequence]
115 |         return word_dict
116 | 
117 |     def get_all_sequences(self):
118 |         """ getter for entire db of words as series of sequences of feature lists for each frame
119 | 
120 |         :return: dict
121 |             dictionary of lists of feature list sequence lists for each word
122 |                 {'FRANK': [[[87, 225], [87, 225], ...], [[88, 219], [88, 219], ...]],
123 |                 ...}
124 |         """
125 |         return self._data
126 | 
127 |     def get_all_Xlengths(self):
128 |         """ getter for entire db of words as (X, lengths) tuples for use with hmmlearn library
129 | 
130 |         :return: dict
131 |             dictionary of (X, lengths) tuples, where X is a numpy array of feature lists and lengths is
132 |             a list of lengths of sequences within X
133 |                 {'FRANK': (array([[ 87, 225], [ 87, 225], ... [ 87, 225, 62, 127], [ 87, 225, 65, 128]]), [14, 18]),
134 |                 ...}
135 |         """
136 |         return self._hmm_data
137 | 
138 |     def get_word_sequences(self, word: str):
139 |         """ getter for single word series of sequences of feature lists for each frame
140 | 
141 |         :param word: str
142 |         :return: list
143 |             lists of feature list sequence lists for the given word
144 |                 [[[87, 225], [87, 225], ...], [[88, 219], [88, 219], ...]]
145 |         """
146 |         return self._data[word]
147 | 
148 |     def get_word_Xlengths(self, word: str):
149 |         """ getter for single word (X, lengths) tuple for use with hmmlearn library
150 | 
151 |         :param word: str
152 |         :return: (list, list)
153 |             (X, lengths) tuple, where X is a numpy array of feature lists and lengths is
154 |             a list of lengths of sequences within X
155 |                 (array([[ 87, 225], [ 87, 225], ... [ 87, 225, 62, 127], [ 87, 225, 65, 128]]), [14, 18])
156 |         """
157 |         return self._hmm_data[word]
158 | 
159 | 
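# --- Added illustrative example (a hedged sketch; variable names are illustrative) ---
# The (X, lengths) tuple returned by get_word_Xlengths feeds GaussianHMM.fit directly:
#
#     from hmmlearn.hmm import GaussianHMM
#
#     X, lengths = training.get_word_Xlengths('BOOK')
#     model = GaussianHMM(n_components=3, n_iter=1000).fit(X, lengths)
#     logL = model.score(X, lengths)
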
160 | class SinglesData(object):
161 |     """ class provides loading and getters for ASL test data (individual word items) suitable for use with hmmlearn library
162 | 
163 |     """
164 | 
165 |     def __init__(self, asl: AslDb, csvfile: str, feature_list):
166 |         """ loads test data sequences suitable for use with hmmlearn library based on the feature_list chosen
167 | 
168 |         :param asl: AslDb object
169 |         :param csvfile: str
170 |             filename of csv file containing word test start and end frame data with expected format:
171 |                 video,speaker,word,startframe,endframe
172 |         :param feature_list: list of str feature labels
173 |         """
174 |         self.df = pd.read_csv(csvfile)
175 |         self.wordlist = list(self.df['word'])
176 |         self.sentences_index = self._load_sentence_word_indices()
177 |         self._data = self._load_data(asl, feature_list)
178 |         self._hmm_data = create_hmmlearn_data(self._data)
179 |         self.num_items = len(self._data)
180 |         self.num_sentences = len(self.sentences_index)
181 | 
182 |     # def _load_data(self, asl, fn, feature_method):
183 |     def _load_data(self, asl, feature_list):
184 |         """ Consolidates sequenced feature data into a dictionary of items and creates an answer list of words in the order
185 |             of the indices used for dictionary keys
186 | 
187 |         :param asl: AslDb object
188 |         :param feature_list: list of str feature labels
189 |             (the csv file was already read into self.df by __init__, so no filename is needed here)
190 | 
191 |         :return: dict
192 |         """
193 |         item_dict = {}
194 |         # for each word indexed in the DataFrame
195 |         for i in range(len(self.df)):
196 |             video = self.df.ix[i, 'video']
197 |             new_sequence = []  # list of sample lists for a sequence
198 |             for frame in range(self.df.ix[i, 'startframe'], self.df.ix[i, 'endframe'] + 1):
199 |                 vid_frame = video, frame
200 |                 sample = [asl.df.ix[vid_frame][f] for f in feature_list]
201 |                 if len(sample) > 0:  # don't add if not found
202 |                     new_sequence.append(sample)
203 |             if i in item_dict:
204 |                 item_dict[i].append(new_sequence)  # list of sequences
205 |             else:
206 |                 item_dict[i] = [new_sequence]
207 |         return item_dict
208 | 
209 |     def _load_sentence_word_indices(self):
210 |         """ create dict of video sentence numbers with list of word indices as values
211 | 
212 |         :return: dict
213 |             {v0: [i0, i1, i2], v1: [i0, i1, i2], ... ,} where v# is video number and
214 |             i# is index to wordlist, ordered by sentence structure
215 |         """
216 |         working_df = self.df.copy()
217 |         working_df['idx'] = working_df.index
218 |         working_df.sort_values(by='startframe', inplace=True)
219 |         p = working_df.pivot('video', 'startframe', 'idx')
220 |         p.fillna(-1, inplace=True)
221 |         p = p.transpose()
222 |         sentence_dict = {}
223 |         for v in p:
224 |             sentence_dict[v] = [int(i) for i in p[v] if i >= 0]
225 |         return sentence_dict
226 | 
227 |     def get_all_sequences(self):
228 |         """ getter for entire db of items as series of sequences of feature lists for each frame
229 | 
230 |         :return: dict
231 |             dictionary of lists of feature list sequence lists for each indexed item
232 |                 {3: [[[87, 225], [87, 225], ...], [[88, 219], [88, 219], ...]],
233 |                 ...}
234 |         """
235 |         return self._data
236 | 
237 |     def get_all_Xlengths(self):
238 |         """ getter for entire db of items as (X, lengths) tuples for use with hmmlearn library
239 | 
240 |         :return: dict
241 |             dictionary of (X, lengths) tuples, where X is a numpy array of feature lists and lengths is
242 |             a list of lengths of sequences within X; should always have only one item in lengths
243 |                 {3: (array([[ 87, 225], [ 87, 225], ... [ 87, 225, 62, 127], [ 87, 225, 65, 128]]), [14]),
244 |                 ...}
245 |         """
246 |         return self._hmm_data
247 | 
248 |     def get_item_sequences(self, item: int):
249 |         """ getter for single item series of sequences of feature lists for each frame
250 | 
251 |         :param item: int
252 |         :return: list
253 |             lists of feature list sequence lists for the given item
254 |                 [[[87, 225], [87, 225], ...]]
255 |         """
256 |         return self._data[item]
257 | 
258 |     def get_item_Xlengths(self, item: int):
259 |         """ getter for single item (X, lengths) tuple for use with hmmlearn library
260 | 
261 |         :param item: int
262 |         :return: (list, list)
263 |             (X, lengths) tuple, where X is a numpy array of feature lists and lengths is
264 |             a list of lengths of sequences within X; lengths should always contain one item
265 |                 (array([[ 87, 225], [ 87, 225], ... [ 87, 225, 62, 127], [ 87, 225, 65, 128]]), [14])
266 |         """
267 |         return self._hmm_data[item]
268 | 
269 | 
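# --- Added illustrative example (a hedged sketch; 'asl' and the feature names are illustrative) ---
# A recognizer typically walks the test set one item at a time, scoring each
# item's (X, lengths) against every trained word model:
#
#     test_set = asl.build_test(['right-x', 'right-y'])
#     for item in range(test_set.num_items):
#         X, lengths = test_set.get_item_Xlengths(item)
#         # score X, lengths against each word model here
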
270 | def combine_sequences(sequences):
271 |     '''
272 |     concatenates sequences and returns a tuple of the new list and lengths
273 |     :param sequences: list of feature sequences
274 |     :return: (list, list)
275 |     '''
276 |     sequence_cat = []
277 |     sequence_lengths = []
278 |     # print("num of sequences = {}".format(len(sequences)))
279 |     for sequence in sequences:
280 |         sequence_cat += sequence
281 |         num_frames = len(sequence)
282 |         sequence_lengths.append(num_frames)
283 |     return sequence_cat, sequence_lengths
284 | 
285 | def create_hmmlearn_data(sequences_dict):
286 |     seq_len_dict = {}
287 |     for key in sequences_dict:
288 |         sequences = sequences_dict[key]
289 |         sequence_cat, sequence_lengths = combine_sequences(sequences)
290 |         seq_len_dict[key] = np.array(sequence_cat), sequence_lengths
291 |     return seq_len_dict
292 | 
293 | if __name__ == '__main__':
294 |     asl = AslDb()
295 |     print(asl.df.ix[98, 1])
296 | 
297 | 
298 | 
--------------------------------------------------------------------------------
/data/train_words.csv:
--------------------------------------------------------------------------------
1 | video,speaker,word,startframe,endframe
2 | 1,woman-1,JOHN,8,17
3 | 1,woman-1,WRITE,22,50
4 | 1,woman-1,HOMEWORK,51,77
5 | 3,woman-2,IX-1P,4,11
6 | 3,woman-2,SEE,12,20
7 | 3,woman-2,JOHN,20,31
8 | 3,woman-2,YESTERDAY,31,40
9 | 3,woman-2,IX,44,52
10 | 4,woman-1,JOHN,2,13
11 | 4,woman-1,IX-1P,13,18
12 | 4,woman-1,SEE,19,27
13 | 4,woman-1,IX,28,35
14 | 4,woman-1,YESTERDAY,36,47
15 | 5,woman-2,LOVE,12,21
16 | 5,woman-2,MARY,22,41
17 | 5,woman-2,JOHN,42,63
18 | 6,woman-2,LOVE,9,24
19 | 6,woman-2,MARY,25,39
20 | 6,woman-2,JOHN,37,50
21 | 6,woman-2,IX,51,54
22 | 8,man-1,JOHN,10,25
23 | 8,man-1,CAN,28,30
24 | 8,man-1,GO,33,41
25 | 8,man-1,CAN,50,56
26 | 9,woman-2,JOHN,21,29
27 | 9,woman-2,CAN,30,35
28 | 9,woman-2,GO1,35,45
29 | 9,woman-2,CAN,46,59
30 | 10,woman-1,JOHN,15,23
31 | 10,woman-1,CAN,27,30
32 | 10,woman-1,GO,36,41
33 | 10,woman-1,CAN,50,57
34 | 11,woman-2,JOHN,8,16
35 | 11,woman-2,CAN,18,24
36 | 11,woman-2,GO1,25,32
37 | 11,woman-2,CAN,33,48
38 | 13,woman-2,JOHN,7,13
39 | 13,woman-2,CAN,15,20
40 | 13,woman-2,GO1,20,29
41 | 13,woman-2,CAN,33,46
42 | 14,woman-2,JOHN,27,34
43 | 14,woman-2,CAN,38,42
44 | 14,woman-2,GO1,43,50
45 | 14,woman-2,CAN,52,62
46 | 15,woman-2,JOHN,7,20
47 | 15,woman-2,FUTURE,26,30
48 | 15,woman-2,GO2,33,38
49 | 15,woman-2,PARTY,40,48
50 | 15,woman-2,FUTURE,53,57
51 | 16,man-1,JOHN,9,35
52 | 16,man-1,FUTURE1,38,43
53 | 16,man-1,GO,45,49
54 | 16,man-1,IX,50,56
55 | 17,woman-2,JOHN,18,23
56 | 17,woman-2,FUTURE1,24,27
57 | 17,woman-2,GO,30,35
58 | 17,woman-2,IX,37,47
59 | 18,woman-1,JOHN,4,8
60 | 18,woman-1,IX,15,20
61 
| 18,woman-1,HIT,21,29 62 | 18,woman-1,BLAME,35,47 63 | 18,woman-1,FRED,54,62 64 | 18,woman-1,IX,64,74 65 | 19,woman-2,JOHN,4,18 66 | 19,woman-2,FISH,21,31 67 | 19,woman-2,WONT,33,38 68 | 19,woman-2,EAT,41,51 69 | 19,woman-2,BUT,57,64 70 | 19,woman-2,CAN,63,67 71 | 19,woman-2,EAT,71,74 72 | 19,woman-2,CHICKEN,74,88 73 | 20,woman-2,JOHN,14,31 74 | 20,woman-2,FISH,38,50 75 | 20,woman-2,WONT,53,58 76 | 20,woman-2,EAT,64,74 77 | 20,woman-2,BUT,79,85 78 | 20,woman-2,CAN,85,90 79 | 20,woman-2,EAT,93,102 80 | 20,woman-2,CHICKEN,104,120 81 | 22,woman-2,VEGETABLE,11,23 82 | 22,woman-2,CHINA,34,46 83 | 22,woman-2,IX,49,54 84 | 22,woman-2,PEOPLE,61,64 85 | 22,woman-2,PREFER,66,69 86 | 22,woman-2,BROCCOLI,74,100 87 | 23,woman-2,VEGETABLE,21,29 88 | 23,woman-2,CHINA,31,41 89 | 23,woman-2,IX,47,55 90 | 23,woman-2,PEOPLE,60,63 91 | 23,woman-2,PREFER,65,70 92 | 23,woman-2,BROCCOLI,76,95 93 | 24,woman-2,JOHN,5,13 94 | 24,woman-2,LIKE,16,19 95 | 24,woman-2,IX,20,30 96 | 24,woman-2,IX,30,37 97 | 24,woman-2,IX,37,52 98 | 26,woman-2,JOHN,10,17 99 | 26,woman-2,LIKE,18,22 100 | 26,woman-2,IX,24,33 101 | 26,woman-2,IX,33,39 102 | 26,woman-2,IX,39,50 103 | 27,woman-2,JOHN,3,10 104 | 27,woman-2,LIKE,11,14 105 | 27,woman-2,IX,15,26 106 | 27,woman-2,IX,26,32 107 | 27,woman-2,IX,32,43 108 | 29,man-1,JOHN,12,21 109 | 29,man-1,LIKE,22,30 110 | 29,man-1,IX,31,40 111 | 29,man-1,IX,41,48 112 | 29,man-1,IX,49,53 113 | 31,man-1,JOHN,7,23 114 | 31,man-1,LEAVE,25,33 115 | 31,man-1,IX,33,40 116 | 32,woman-2,JOHN,6,15 117 | 32,woman-2,LEAVE,15,21 118 | 32,woman-2,IX,21,33 119 | 33,woman-2,JOHN,5,10 120 | 33,woman-2,LEAVE,12,20 121 | 33,woman-2,IX,20,30 122 | 34,woman-2,JOHN,5,13 123 | 34,woman-2,SAY,15,18 124 | 34,woman-2,MARY,20,31 125 | 34,woman-2,IX-1P,37,41 126 | 34,woman-2,BUY,44,50 127 | 34,woman-2,HOUSE,52,66 128 | 35,man-1,MARY,18,33 129 | 35,man-1,VEGETABLE,50,62 130 | 35,man-1,KNOW,69,74 131 | 35,man-1,IX,75,77 132 | 35,man-1,LIKE,79,84 133 | 35,man-1,CORN,89,104 134 | 37,woman-2,MARY,7,17 135 | 37,woman-2,VEGETABLE,24,40 136 | 37,woman-2,IX-1P,41,47 137 | 37,woman-2,KNOW,51,55 138 | 37,woman-2,IX,55,59 139 | 37,woman-2,LIKE,62,67 140 | 37,woman-2,CORN1,70,92 141 | 38,woman-2,MARY,6,22 142 | 38,woman-2,VEGETABLE,26,41 143 | 38,woman-2,KNOW,43,51 144 | 38,woman-2,IX,51,57 145 | 38,woman-2,LIKE,59,63 146 | 38,woman-2,CORN1,67,88 147 | 39,man-1,JOHN,9,23 148 | 39,man-1,IX,28,35 149 | 39,man-1,THINK,37,47 150 | 39,man-1,MARY,46,60 151 | 39,man-1,LOVE,63,77 152 | 41,woman-2,JOHN,7,13 153 | 41,woman-2,IX,14,21 154 | 41,woman-2,THINK,25,32 155 | 41,woman-2,MARY,34,41 156 | 41,woman-2,LOVE,43,64 157 | 42,woman-1,JOHN,6,15 158 | 42,woman-1,CAN,19,23 159 | 42,woman-1,BUY,23,29 160 | 42,woman-1,HOUSE,32,46 161 | 44,woman-2,JOHN,7,13 162 | 44,woman-2,FUTURE,14,22 163 | 44,woman-2,BUY,25,30 164 | 44,woman-2,HOUSE,33,45 165 | 45,woman-1,JOHN,13,18 166 | 45,woman-1,FUTURE,21,26 167 | 45,woman-1,NOT,27,33 168 | 45,woman-1,BUY,36,40 169 | 45,woman-1,HOUSE,42,56 170 | 46,woman-1,JOHN,10,14 171 | 46,woman-1,PAST,16,28 172 | 46,woman-1,LIVE,32,37 173 | 46,woman-1,CHICAGO,40,50 174 | 47,woman-2,JOHN,6,10 175 | 47,woman-2,BUY,12,21 176 | 47,woman-2,CAR,21,30 177 | 47,woman-2,FUTURE,31,41 178 | 48,woman-1,JOHN,5,12 179 | 48,woman-1,BUY,17,24 180 | 48,woman-1,CAR,27,35 181 | 48,woman-1,FUTURE,38,52 182 | 49,woman-1,JOHN,5,10 183 | 49,woman-1,BUY,13,19 184 | 49,woman-1,CAR,21,29 185 | 49,woman-1,SHOULD,29,44 186 | 51,woman-1,JOHN,5,13 187 | 51,woman-1,BUY,17,23 188 | 51,woman-1,CAR,25,33 189 | 51,woman-1,FUTURE,34,37 190 | 51,woman-1,NOT,39,47 
191 | 52,woman-1,JOHN,5,13 192 | 52,woman-1,SHOULD,14,19 193 | 52,woman-1,NOT,20,30 194 | 52,woman-1,BUY,32,37 195 | 52,woman-1,HOUSE,40,55 196 | 53,woman-2,JOHN,5,12 197 | 53,woman-2,SHOULD,13,21 198 | 53,woman-2,NOT,21,26 199 | 53,woman-2,BUY,29,33 200 | 53,woman-2,HOUSE,36,49 201 | 55,woman-2,JOHN,6,12 202 | 55,woman-2,SHOULD,13,23 203 | 55,woman-2,BUY,29,34 204 | 55,woman-2,HOUSE,35,49 205 | 56,man-1,JOHN,10,21 206 | 56,man-1,DECIDE,27,36 207 | 56,man-1,VISIT,37,46 208 | 56,man-1,MARY,47,55 209 | 58,man-1,JOHN,9,34 210 | 58,man-1,DECIDE,34,45 211 | 58,man-1,VISIT,47,58 212 | 58,man-1,MARY,58,66 213 | 59,woman-1,JOHN,3,7 214 | 59,woman-1,FUTURE,9,16 215 | 59,woman-1,NOT,19,24 216 | 59,woman-1,BUY,27,31 217 | 59,woman-1,HOUSE,34,49 218 | 60,woman-1,JOHN,5,10 219 | 60,woman-1,PREFER,10,23 220 | 60,woman-1,GO,23,31 221 | 60,woman-1,MOVIE,34,47 222 | 61,woman-1,JOHN,2,6 223 | 61,woman-1,WANT,12,21 224 | 61,woman-1,SELL,20,30 225 | 61,woman-1,CAR,30,46 226 | 62,woman-1,JOHN,1,7 227 | 62,woman-1,WANT,10,15 228 | 62,woman-1,SELL,15,22 229 | 62,woman-1,CAR,24,32 230 | 62,woman-1,FUTURE,34,51 231 | 63,woman-1,FUTURE,6,11 232 | 63,woman-1,JOHN,13,21 233 | 63,woman-1,GO,23,29 234 | 63,woman-1,FUTURE,32,43 235 | 64,woman-1,JOHN,5,14 236 | 64,woman-1,PREFER,14,23 237 | 64,woman-1,GO,25,30 238 | 64,woman-1,MOVIE,34,39 239 | 64,woman-1,TOMORROW,41,57 240 | 65,woman-1,NEXT-WEEK,8,14 241 | 65,woman-1,JOHN,15,23 242 | 65,woman-1,GO,25,32 243 | 65,woman-1,NEW-YORK,36,52 244 | 66,woman-1,JOHN,2,8 245 | 66,woman-1,FUTURE,10,23 246 | 66,woman-1,NOT,26,33 247 | 66,woman-1,BUY,35,39 248 | 66,woman-1,HOUSE,41,52 249 | 68,woman-2,LAST-WEEK,5,17 250 | 68,woman-2,JOHN,19,29 251 | 68,woman-2,GO,33,43 252 | 68,woman-2,NEW-YORK,45,58 253 | 69,woman-2,JOHN,4,11 254 | 69,woman-2,WILL,13,17 255 | 69,woman-2,VISIT,19,30 256 | 69,woman-2,MARY,30,44 257 | 70,woman-2,JOHN,3,11 258 | 70,woman-2,WILL,11,18 259 | 70,woman-2,VISIT,20,28 260 | 70,woman-2,MARY,29,40 261 | 72,woman-2,JOHN,3,11 262 | 72,woman-2,FINISH,13,20 263 | 72,woman-2,VISIT,20,31 264 | 72,woman-2,MARY,32,45 265 | 73,man-1,JOHN,2,13 266 | 73,man-1,NOT,13,24 267 | 73,man-1,VISIT,26,38 268 | 73,man-1,MARY,41,54 269 | 75,woman-2,JOHN,3,10 270 | 75,woman-2,FUTURE,10,19 271 | 75,woman-2,FINISH,19,25 272 | 75,woman-2,VISIT,29,42 273 | 75,woman-2,MARY,43,53 274 | 76,woman-2,ANN,2,14 275 | 76,woman-2,BLAME,14,26 276 | 76,woman-2,MARY,26,45 277 | 78,woman-2,JOHN,4,12 278 | 78,woman-2,FUTURE,12,22 279 | 78,woman-2,FINISH,22,26 280 | 78,woman-2,SEE,30,36 281 | 78,woman-2,MARY,38,57 282 | 79,woman-2,JOHN,4,11 283 | 79,woman-2,SHOULD,11,17 284 | 79,woman-2,FINISH,17,24 285 | 79,woman-2,READ,30,35 286 | 79,woman-2,BOOK,39,44 287 | 80,woman-2,JOHN,4,18 288 | 80,woman-2,FUTURE,20,26 289 | 80,woman-2,FINISH,26,32 290 | 80,woman-2,READ,35,41 291 | 80,woman-2,BOOK,46,51 292 | 81,woman-2,JOHN,4,10 293 | 81,woman-2,LIKE,13,19 294 | 81,woman-2,CHOCOLATE,19,35 295 | 82,woman-1,JOHN,7,14 296 | 82,woman-1,SHOULD,16,25 297 | 82,woman-1,NOT,26,31 298 | 82,woman-1,LIKE,31,37 299 | 82,woman-1,CHOCOLATE,37,56 300 | 83,woman-1,IX-1P,6,11 301 | 83,woman-1,FIND,11,14 302 | 83,woman-1,SOMETHING-ONE,15,24 303 | 83,woman-1,POSS,24,37 304 | 83,woman-1,BOOK,40,46 305 | 85,woman-1,JOHN,10,18 306 | 85,woman-1,BROTHER,25,33 307 | 85,woman-1,ARRIVE,37,45 308 | 86,woman-1,SOMETHING-ONE,7,15 309 | 86,woman-1,ARRIVE,18,24 310 | 86,woman-1,HERE,25,35 311 | 87,woman-2,JOHN,7,19 312 | 87,woman-2,GIVE,19,31 313 | 87,woman-2,IX,31,46 314 | 87,woman-2,MAN,46,58 315 | 87,woman-2,IX,58,73 316 | 
87,woman-2,NEW,89,100 317 | 87,woman-2,COAT,100,119 318 | 88,woman-2,JOHN,7,18 319 | 88,woman-2,GIVE,20,40 320 | 88,woman-2,IX,40,51 321 | 88,woman-2,MAN,54,66 322 | 88,woman-2,NEW,68,76 323 | 88,woman-2,COAT,77,93 324 | 91,woman-2,JOHN,4,9 325 | 91,woman-2,IX,12,22 326 | 91,woman-2,GIVE,22,37 327 | 91,woman-2,IX,37,47 328 | 91,woman-2,WOMAN,47,68 329 | 91,woman-2,IX,68,95 330 | 91,woman-2,BOOK,99,105 331 | 93,man-1,JOHN,13,36 332 | 93,man-1,GIVE1,40,57 333 | 93,man-1,IX,59,76 334 | 93,man-1,SOMETHING-ONE,76,87 335 | 93,man-1,WOMAN,88,99 336 | 93,man-1,BOOK,102,112 337 | 94,woman-2,JOHN,4,15 338 | 94,woman-2,GIVE1,15,29 339 | 94,woman-2,IX,33,51 340 | 94,woman-2,SOMETHING-ONE,51,60 341 | 94,woman-2,WOMAN,64,74 342 | 94,woman-2,BOOK,78,85 343 | 95,man-1,IX,14,34 344 | 95,man-1,WOMAN,34,52 345 | 95,man-1,HAVE,52,60 346 | 95,man-1,BOOK,61,68 347 | 96,man-1,IX,13,29 348 | 96,man-1,HAVE,29,39 349 | 96,man-1,BOOK,42,49 350 | 97,woman-2,FRANK,7,24 351 | 97,woman-2,POSS,24,32 352 | 97,woman-2,NEW,35,44 353 | 97,woman-2,CAR,44,50 354 | 97,woman-2,BREAK-DOWN,50,66 355 | 98,woman-1,POSS,16,22 356 | 98,woman-1,FRANK,22,35 357 | 98,woman-1,NEW,39,48 358 | 98,woman-1,CAR,50,60 359 | 98,woman-1,BREAK-DOWN,62,75 360 | 99,woman-1,FRANK,11,23 361 | 99,woman-1,POSS,25,33 362 | 99,woman-1,CAR,39,46 363 | 99,woman-1,BREAK-DOWN,47,61 364 | 101,woman-1,POSS,23,33 365 | 101,woman-1,NEW,36,46 366 | 101,woman-1,CAR,48,54 367 | 101,woman-1,BREAK-DOWN,57,69 368 | 102,woman-2,JOHN,4,13 369 | 102,woman-2,SEARCH-FOR,18,33 370 | 102,woman-2,WHO,32,42 371 | 102,woman-2,POSS,44,49 372 | 102,woman-2,BOOK,52,58 373 | 102,woman-2,WHAT,59,79 374 | 103,woman-1,JOHN,4,12 375 | 103,woman-1,POSS,15,20 376 | 103,woman-1,BROTHER,20,36 377 | 103,woman-1,ARRIVE,39,50 378 | 104,woman-1,JOHN,8,16 379 | 104,woman-1,POSS,18,25 380 | 104,woman-1,LEG,26,33 381 | 106,man-1,JOHN,20,36 382 | 106,man-1,POSS,39,46 383 | 106,man-1,FRIEND,48,65 384 | 106,man-1,HAVE,65,71 385 | 106,man-1,CANDY,73,84 386 | 109,woman-2,WOMAN,19,30 387 | 109,woman-2,ARRIVE,31,35 388 | 109,woman-2,HERE,36,43 389 | 110,woman-2,WHAT,5,19 390 | 110,woman-2,SOMETHING-ONE,21,27 391 | 110,woman-2,WHAT,30,34 392 | 110,woman-2,WOMAN,42,52 393 | 110,woman-2,ARRIVE,54,60 394 | 111,woman-2,WHAT,10,22 395 | 111,woman-2,WOMAN,27,34 396 | 111,woman-2,WHAT,37,47 397 | 111,woman-2,ARRIVE,52,62 398 | 111,woman-2,HERE,64,69 399 | 112,man-1,IX,17,24 400 | 112,man-1,CAR,27,36 401 | 112,man-1,BLUE,40,55 402 | 112,man-1,SUE,56,63 403 | 112,man-1,BUY1,64,79 404 | 114,man-1,SOMETHING-ONE,16,27 405 | 114,man-1,POSS,30,34 406 | 114,man-1,CAR,37,46 407 | 114,man-1,STOLEN,54,74 408 | 115,woman-2,SOMETHING-ONE,6,19 409 | 115,woman-2,CAR,21,28 410 | 115,woman-2,STOLEN,34,49 411 | 116,man-1,JOHN,0,14 412 | 116,man-1,POSS,16,21 413 | 116,man-1,OLD,23,31 414 | 116,man-1,HOUSE,33,41 415 | 116,man-1,SELL,42,50 416 | 116,man-1,YESTERDAY,53,66 417 | 117,man-1,SUE,14,23 418 | 117,man-1,BUY1,26,31 419 | 117,man-1,IX,34,40 420 | 117,man-1,CAR,40,46 421 | 117,man-1,BLUE,46,53 422 | 118,man-1,SUE,9,27 423 | 118,man-1,BUY1,31,38 424 | 118,man-1,IX,45,54 425 | 118,man-1,CAR,56,64 426 | 118,man-1,BLUE,66,72 427 | 120,man-1,SUE,8,19 428 | 120,man-1,BUY1,22,30 429 | 120,man-1,CAR,33,40 430 | 120,man-1,BLUE,39,44 431 | 121,woman-2,JOHN,6,18 432 | 121,woman-2,READ,21,28 433 | 121,woman-2,BOOK,30,37 434 | 123,woman-1,SOMETHING-ONE,7,15 435 | 123,woman-1,STUDENT,18,25 436 | 123,woman-1,HAVE,26,33 437 | 123,woman-1,VIDEOTAPE,35,52 438 | 124,man-1,SOMETHING-ONE,23,30 439 | 124,man-1,STUDENT,33,41 440 | 
124,man-1,SOMETHING-ONE,49,61 441 | 124,man-1,BORROW,70,75 442 | 124,man-1,VIDEOTAPE,78,93 443 | 125,woman-1,STUDENT,4,13 444 | 125,woman-1,HAVE,14,24 445 | 125,woman-1,VIDEOTAPE,30,45 446 | 126,woman-1,JOHN,7,14 447 | 126,woman-1,MOTHER,17,22 448 | 126,woman-1,ARRIVE,30,36 449 | 126,woman-1,WHO,43,50 450 | 126,woman-1,MOTHER,53,60 451 | 126,woman-1,ARRIVE,63,73 452 | 127,man-1,JOHN,29,38 453 | 127,man-1,SEE,39,44 454 | 127,man-1,WHO,45,55 455 | 127,man-1,WHAT,61,72 456 | 128,woman-1,IX,6,12 457 | 128,woman-1,JOHN,12,19 458 | 128,woman-1,SEE,20,27 459 | 128,woman-1,WHO,27,38 460 | 128,woman-1,IX,38,43 461 | 128,woman-1,WHO,43,58 462 | 129,man-1,WHAT,22,31 463 | 129,man-1,MARY,35,45 464 | 129,man-1,WONT,46,56 465 | 129,man-1,EAT,58,64 466 | 129,man-1,WHAT,69,77 467 | 130,man-1,JOHN,23,35 468 | 130,man-1,MARY,41,51 469 | 130,man-1,LOVE,53,61 470 | 130,man-1,IX,62,66 471 | 131,man-1,JOHN,26,42 472 | 131,man-1,IX,49,56 473 | 131,man-1,LIKE,56,64 474 | 131,man-1,MARY,67,81 475 | 132,man-1,WHO,21,34 476 | 132,man-1,VEGETABLE,38,50 477 | 132,man-1,PREFER,54,62 478 | 132,man-1,POTATO,66,75 479 | 132,man-1,WHAT,78,95 480 | 133,man-1,WHAT,17,26 481 | 133,man-1,JOHN,28,36 482 | 133,man-1,LOVE,38,46 483 | 133,man-1,WHAT,47,58 484 | 134,woman-1,WHAT,13,17 485 | 134,woman-1,JOHN,18,25 486 | 134,woman-1,LOVE,27,34 487 | 134,woman-1,WHAT,40,51 488 | 135,woman-1,WHAT,6,10 489 | 135,woman-1,JOHN,11,19 490 | 135,woman-1,LOVE,20,28 491 | 135,woman-1,WHAT,32,42 492 | 136,woman-1,WHO,15,17 493 | 136,woman-1,TELL,19,26 494 | 136,woman-1,BILL,28,39 495 | 136,woman-1,MARY,45,62 496 | 137,man-1,JOHN,5,12 497 | 137,man-1,BUY1,14,21 498 | 137,man-1,WHAT,23,29 499 | 137,man-1,YESTERDAY,33,41 500 | 137,man-1,BOOK,46,62 501 | 138,woman-1,JOHN,5,10 502 | 138,woman-1,BUY,16,23 503 | 138,woman-1,WHAT,25,33 504 | 138,woman-1,YESTERDAY,35,50 505 | 138,woman-1,BOOK,60,74 506 | 140,woman-1,JOHN,5,9 507 | 140,woman-1,BUY,14,19 508 | 140,woman-1,YESTERDAY,21,30 509 | 140,woman-1,WHAT,34,48 510 | 140,woman-1,BOOK,55,67 511 | 141,woman-1,JOHN,4,10 512 | 141,woman-1,BUY,15,24 513 | 141,woman-1,WHAT,28,41 514 | 141,woman-1,BOOK,49,62 515 | 143,man-1,TELL,7,14 516 | 143,man-1,BILL,15,23 517 | 143,man-1,YESTERDAY,26,34 518 | 143,man-1,WHO,36,47 519 | 143,man-1,MARY,51,64 520 | 144,woman-1,TELL,11,15 521 | 144,woman-1,BILL,16,24 522 | 144,woman-1,YESTERDAY,26,39 523 | 144,woman-1,WHO,41,57 524 | 144,woman-1,MARY,57,73 525 | 145,woman-1,WHO,11,14 526 | 145,woman-1,TELL,14,20 527 | 145,woman-1,BILL,21,27 528 | 145,woman-1,WHAT,34,46 529 | 145,woman-1,MARY,50,64 530 | 146,woman-1,WHO,9,14 531 | 146,woman-1,JOHN,15,21 532 | 146,woman-1,VISIT,23,32 533 | 146,woman-1,WHAT,36,46 534 | 146,woman-1,MARY,48,62 535 | 147,woman-1,TELL,8,17 536 | 147,woman-1,BILL,19,31 537 | 147,woman-1,YESTERDAY,31,45 538 | 147,woman-1,WHO,50,62 539 | 147,woman-1,MARY,66,77 540 | 148,man-1,JOHN,9,18 541 | 148,man-1,BUY1,20,25 542 | 148,man-1,YESTERDAY,31,37 543 | 148,man-1,WHAT,39,48 544 | 148,man-1,BOOK,53,63 545 | 149,man-1,WHO,13,19 546 | 149,man-1,JOHN,19,29 547 | 149,man-1,SEE,31,36 548 | 149,man-1,WHAT,40,51 549 | 149,man-1,MARY,55,70 550 | 150,woman-1,JOHN,1,9 551 | 150,woman-1,SEE,9,17 552 | 150,woman-1,THROW,20,33 553 | 150,woman-1,APPLE,39,43 554 | 150,woman-1,WHO,46,55 555 | 150,woman-1,MARY,57,72 556 | 151,woman-1,JOHN,5,11 557 | 151,woman-1,SEE,12,22 558 | 151,woman-1,THROW,26,39 559 | 151,woman-1,APPLE,42,52 560 | 151,woman-1,WHO,56,64 561 | 151,woman-1,MARY,66,77 562 | 152,man-1,JOHN,12,19 563 | 152,man-1,SEE,22,35 564 | 152,man-1,THROW,37,48 565 | 
152,man-1,APPLE,50,59 566 | 152,man-1,WHO,60,73 567 | 152,man-1,MARY,77,90 568 | 153,man-1,ARRIVE,14,20 569 | 153,man-1,WHO,26,40 570 | 154,woman-1,ARRIVE,13,21 571 | 154,woman-1,WHO,27,41 572 | 155,man-1,JOHN,16,29 573 | 155,man-1,GO,30,38 574 | 155,man-1,SHOULD,40,51 575 | 155,man-1,SHOULD,56,63 576 | 156,woman-1,JOHN,8,14 577 | 156,woman-1,GO,16,29 578 | 156,woman-1,SHOULD,29,41 579 | 157,man-1,JOHN,5,16 580 | 157,man-1,SHOULD,18,24 581 | 157,man-1,GO,25,32 582 | 157,man-1,SHOULD,35,42 583 | 157,man-1,SHOULD,49,55 584 | 159,man-1,LOVE,15,20 585 | 159,man-1,JOHN,22,29 586 | 159,man-1,WHO,31,46 587 | 160,woman-1,LOVE,10,16 588 | 160,woman-1,JOHN,20,29 589 | 160,woman-1,WHO,34,46 590 | 161,woman-1,WHO,15,20 591 | 161,woman-1,JOHN,20,25 592 | 161,woman-1,LOVE,28,34 593 | 161,woman-1,WHO,41,51 594 | 162,woman-1,WHAT,8,14 595 | 162,woman-1,JOHN,15,19 596 | 162,woman-1,LIKE,22,33 597 | 163,woman-1,WHO,22,27 598 | 163,woman-1,ARRIVE,31,44 599 | 164,woman-1,NAME,14,30 600 | 165,man-1,SHOOT,29,45 601 | 165,man-1,FRANK,48,60 602 | 166,man-1,JOHN,13,24 603 | 166,man-1,IX,29,35 604 | 166,man-1,SAY-1P,36,43 605 | 166,man-1,LOVE,44,54 606 | 166,man-1,MARY,59,70 607 | 168,man-1,JOHN,19,36 608 | 168,man-1,BLAME,40,50 609 | 169,man-1,MARY,18,31 610 | 169,man-1,BILL,37,48 611 | 169,man-1,SAY,51,57 612 | 169,man-1,JOHN,57,67 613 | 169,man-1,LOVE,70,87 614 | 170,man-1,JOHN,10,24 615 | 170,man-1,MARY,23,35 616 | 170,man-1,BLAME,38,48 617 | 172,man-1,MARY,18,34 618 | 172,man-1,SELF,41,49 619 | 172,man-1,PREFER,51,58 620 | 172,man-1,CORN,59,74 621 | 173,man-1,MARY,8,21 622 | 173,man-1,SELF,23,30 623 | 173,man-1,PREFER,33,39 624 | 173,man-1,CORN,40,55 625 | 175,man-1,PEOPLE,18,32 626 | 175,man-1,GROUP,35,43 627 | 175,man-1,GIVE1,46,71 628 | 175,man-1,JANA,74,91 629 | 175,man-1,TOY1,95,112 630 | 176,man-1,MANY,24,32 631 | 176,man-1,PEOPLE,35,45 632 | 176,man-1,GIVE1,47,70 633 | 176,man-1,JANA,75,89 634 | 176,man-1,TOY,94,106 635 | 177,man-1,JOHN,11,34 636 | 177,man-1,LOVE,42,54 637 | 177,man-1,MARY,57,66 638 | 178,man-1,JOHN,9,20 639 | 178,man-1,ARRIVE,25,38 640 | 179,man-1,JOHN,7,27 641 | 179,man-1,ARRIVE,32,48 642 | 180,man-1,JOHN,9,24 643 | 180,man-1,ARRIVE,28,43 644 | 182,man-1,ALL,40,51 645 | 182,man-1,BOY,54,59 646 | 182,man-1,GIVE1,73,93 647 | 182,man-1,TEACHER,98,108 648 | 182,man-1,APPLE,112,122 649 | 183,man-1,ALL,14,24 650 | 183,man-1,BOY,27,37 651 | 183,man-1,GIVE1,41,68 652 | 183,man-1,TEACHER,70,80 653 | 183,man-1,APPLE,86,96 654 | 185,man-1,IX,13,19 655 | 185,man-1,GIRL,23,32 656 | 185,man-1,GIVE,34,46 657 | 185,man-1,JOHN,49,57 658 | 185,man-1,BOX,60,77 659 | 186,man-1,IX,8,17 660 | 186,man-1,GIRL,21,27 661 | 186,man-1,GIVE2,30,40 662 | 186,man-1,JOHN,40,49 663 | 186,man-1,BOX,51,66 664 | 187,man-1,TEACHER,10,21 665 | 187,man-1,GIVE,24,31 666 | 187,man-1,GIVE,38,43 667 | 187,man-1,GIVE,49,55 668 | 187,man-1,BOY,60,66 669 | 187,man-1,BOOK,70,77 670 | 188,man-1,TEACHER,21,42 671 | 188,man-1,GIVE3,58,76 672 | 188,man-1,BOY,83,90 673 | 188,man-1,BOOK,95,104 674 | 190,man-1,IX,5,15 675 | 190,man-1,GIRL,19,28 676 | 190,man-1,GIVE,30,38 677 | 190,man-1,JOHN,39,50 678 | 190,man-1,BOX,52,70 679 | 191,man-1,JOHN,2,9 680 | 191,man-1,GIVE,10,24 681 | 191,man-1,GIRL,28,37 682 | 191,man-1,BOX,37,53 683 | 192,man-1,JOHN,26,41 684 | 192,man-1,GIVE3,44,63 685 | 192,man-1,GIRL,67,74 686 | 192,man-1,BOX,78,93 687 | 194,woman-1,JOHN,1,17 688 | 194,woman-1,CAN,21,25 689 | 194,woman-1,GET,26,33 690 | 194,woman-1,CAN,36,47 691 | 195,woman-2,JOHN,7,14 692 | 195,woman-2,NOT,17,20 693 | 195,woman-2,VISIT,22,29 694 | 
195,woman-2,MARY,30,41 695 | 195,woman-2,VISIT,58,67 696 | 195,woman-2,MOTHER,68,73 697 | 195,woman-2,IX,75,81 698 | 196,man-1,IX,18,26 699 | 196,man-1,BOOK,28,35 700 | 196,man-1,PUTASIDE,37,53 701 | 197,man-1,JOHN,8,16 702 | 197,man-1,ARRIVE,18,24 703 | 197,man-1,NOT,28,35 704 | 197,man-1,WHAT,37,49 705 | 198,man-1,LIKE,8,14 706 | 198,man-1,CHOCOLATE,19,30 707 | 198,man-1,WHO,34,47 708 | 200,woman-1,JOHN,5,12 709 | 200,woman-1,VISIT,14,21 710 | 200,woman-1,WHO,26,38 711 | 200,woman-1,WHAT,42,56 712 | -------------------------------------------------------------------------------- /asl_recognizer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Artificial Intelligence Engineer Nanodegree - Probabilistic Models\n", 10 | "## Project: Sign Language Recognition System\n", 11 | "- [Introduction](#intro)\n", 12 | "- [Part 1 Feature Selection](#part1_tutorial)\n", 13 | " - [Tutorial](#part1_tutorial)\n", 14 | " - [Features Submission](#part1_submission)\n", 15 | " - [Features Unittest](#part1_test)\n", 16 | "- [Part 2 Train the models](#part2_tutorial)\n", 17 | " - [Tutorial](#part2_tutorial)\n", 18 | " - [Model Selection Score Submission](#part2_submission)\n", 19 | " - [Model Score Unittest](#part2_test)\n", 20 | "- [Part 3 Build a Recognizer](#part3_tutorial)\n", 21 | " - [Tutorial](#part3_tutorial)\n", 22 | " - [Recognizer Submission](#part3_submission)\n", 23 | " - [Recognizer Unittest](#part3_test)\n", 24 | "- [Part 4 (OPTIONAL) Improve the WER with Language Models](#part4_info)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "\n", 32 | "## Introduction\n", 33 | "The overall goal of this project is to build a word recognizer for American Sign Language video sequences, demonstrating the power of probabilistic models. In particular, this project employs [hidden Markov models (HMM's)](https://en.wikipedia.org/wiki/Hidden_Markov_model) to analyze a series of measurements taken from videos of American Sign Language (ASL) collected for research (see the [RWTH-BOSTON-104 Database](http://www-i6.informatik.rwth-aachen.de/~dreuw/database-rwth-boston-104.php)). In this video, the right-hand x and y locations are plotted as the speaker signs the sentence.\n", 34 | "[![ASLR demo](http://www-i6.informatik.rwth-aachen.de/~dreuw/images/demosample.png)](https://drive.google.com/open?id=0B_5qGuFe-wbhUXRuVnNZVnMtam8)\n", 35 | "\n", 36 | "The raw data, train, and test sets are pre-defined. You will derive a variety of feature sets (explored in Part 1), as well as implement three different model selection criteria to determine the optimal number of hidden states for each word model (explored in Part 2). Finally, in Part 3 you will implement the recognizer and compare the effects of the different combinations of feature sets and model selection criteria. \n", 37 | "\n", 38 | "At the end of each Part, complete the submission cells with implementations, answer all questions, and pass the unit tests. Then submit the completed notebook for review!" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "\n", 46 | "## PART 1: Data\n", 47 | "\n", 48 | "### Features Tutorial\n", 49 | "##### Load the initial database\n", 50 | "A data handler designed for this database is provided in the student codebase as the `AslDb` class in the `asl_data` module. 
This handler creates the initial [pandas](http://pandas.pydata.org/pandas-docs/stable/) dataframe from the corpus of data included in the `data` directory as well as dictionaries suitable for extracting data in a format friendly to the [hmmlearn](https://hmmlearn.readthedocs.io/en/latest/) library. We'll use those to create models in Part 2.\n", 51 | "\n", 52 | "To start, let's set up the initial database and select an example set of features for the training set. At the end of Part 1, you will create additional feature sets for experimentation. " 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "import numpy as np\n", 64 | "import pandas as pd\n", 65 | "from asl_data import AslDb\n", 66 | "\n", 67 | "\n", 68 | "asl = AslDb() # initializes the database\n", 69 | "asl.df.head() # displays the first five rows of the asl database, indexed by video and frame" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "asl.df.ix[98,1] # look at the data available for an individual frame" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "The frame represented by video 98, frame 1 is shown here:\n", 88 | "![Video 98](http://www-i6.informatik.rwth-aachen.de/~dreuw/database/rwth-boston-104/overview/images/orig/098-start.jpg)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "##### Feature selection for training the model\n", 96 | "The objective of feature selection when training a model is to choose the most relevant variables while keeping the model as simple as possible, thus reducing training time. We can use the raw features already provided or derive our own and add columns to the pandas dataframe `asl.df` for selection. As an example, in the next cell a feature named `'grnd-ry'` is added. This feature is the difference between the right-hand y value and the nose y value, which serves as the \"ground\" right y value. " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "asl.df['grnd-ry'] = asl.df['right-y'] - asl.df['nose-y']\n", 108 | "asl.df.head() # the new feature 'grnd-ry' is now in the frames dictionary" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "##### Try it!" 
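A minimal sketch of the remaining "ground" columns, in case you want to check your work after attempting the next cell (this assumes the corresponding `nose-x`, `left-x`, and `left-y` columns exist in `asl.df`, mirroring the `grnd-ry` example above):

```python
# Sketch only: the remaining "ground" features, each a hand coordinate
# expressed relative to the matching nose coordinate, mirroring grnd-ry.
asl.df['grnd-rx'] = asl.df['right-x'] - asl.df['nose-x']
asl.df['grnd-ly'] = asl.df['left-y'] - asl.df['nose-y']
asl.df['grnd-lx'] = asl.df['left-x'] - asl.df['nose-x']
```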
116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "from asl_utils import test_features_tryit\n", 127 | "# TODO add df columns for 'grnd-rx', 'grnd-ly', 'grnd-lx' representing differences between hand and nose locations\n", 128 | "\n", 129 | "# test the code\n", 130 | "test_features_tryit(asl)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# collect the features into a list\n", 142 | "features_ground = ['grnd-rx','grnd-ry','grnd-lx','grnd-ly']\n", 143 | " #show a single set of features for a given (video, frame) tuple\n", 144 | "[asl.df.ix[98,1][v] for v in features_ground]" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "##### Build the training set\n", 152 | "Now that we have a feature list defined, we can pass that list to the `build_training` method to collect the features for all the words in the training set. Each word in the training set has multiple examples from various videos. Below we can see the unique words that have been loaded into the training set:" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "training = asl.build_training(features_ground)\n", 164 | "print(\"Training words: {}\".format(training.words))" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "The training data in `training` is an object of class `WordsData` defined in the `asl_data` module. In addition to the `words` list, data can be accessed with the `get_all_sequences`, `get_all_Xlengths`, `get_word_sequences`, and `get_word_Xlengths` methods. We need the `get_word_Xlengths` method to train multiple sequences with the `hmmlearn` library. In the following example, notice that there are two lists; the first is a concatenation of all the sequences (the X portion) and the second is a list of the sequence lengths (the Lengths portion)." 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "training.get_word_Xlengths('CHOCOLATE')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "##### More feature sets\n", 190 | "So far we have a simple feature set that is enough to get started modeling. However, we might get better results if we manipulate the raw values a bit more, so we will go ahead and set up some other options now for experimentation later. For example, we could normalize each speaker's range of motion with grouped statistics using [Pandas stats](http://pandas.pydata.org/pandas-docs/stable/api.html#api-dataframe-stats) functions and [pandas groupby](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html). Below is an example for finding the means of all speaker subgroups. 
191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": false, 198 | "scrolled": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "df_means = asl.df.groupby('speaker').mean()\n", 203 | "df_means" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "To select a mean that matches by speaker, use the pandas [map](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.map.html) method:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "asl.df['left-x-mean']= asl.df['speaker'].map(df_means['left-x'])\n", 222 | "asl.df.head()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "##### Try it!" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "from asl_utils import test_std_tryit\n", 241 | "# TODO Create a dataframe named `df_std` with standard deviations grouped by speaker\n", 242 | "\n", 243 | "# test the code\n", 244 | "test_std_tryit(df_std)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "\n", 252 | "### Features Implementation Submission\n", 253 | "Implement four feature sets and answer the question that follows.\n", 254 | "- normalized Cartesian coordinates\n", 255 | " - use *mean* and *standard deviation* statistics and the [standard score](https://en.wikipedia.org/wiki/Standard_score) equation to account for speakers with different heights and arm lengths\n", 256 | " \n", 257 | "- polar coordinates\n", 258 | " - calculate polar coordinates with [Cartesian to polar equations](https://en.wikipedia.org/wiki/Polar_coordinate_system#Converting_between_polar_and_Cartesian_coordinates)\n", 259 | " - use the [np.arctan2](https://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.arctan2.html) function and *swap the x and y axes* to move the $0$ to $2\pi$ discontinuity to 12 o'clock instead of 3 o'clock; in other words, the normal break in radians value from $0$ to $2\pi$ occurs directly to the left of the speaker's nose, which may be in the signing area and interfere with results. By swapping the x and y axes, that discontinuity moves to directly above the speaker's head, an area not generally used in signing.\n", 260 | "\n", 261 | "- delta difference\n", 262 | " - as described in Thad's lecture, use the difference in values between one frame and the next frame as features\n", 263 | " - pandas [diff method](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.diff.html) and [fillna method](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html) will be helpful for this one\n", 264 | "\n", 265 | "- custom features\n", 266 | " - These are your own design; combine techniques used above or come up with something else entirely. We look forward to seeing what you come up with! 
\n", 267 | " Some ideas to get you started:\n", 268 | " - normalize using a [feature scaling equation](https://en.wikipedia.org/wiki/Feature_scaling)\n", 269 | " - normalize the polar coordinates\n", 270 | " - add additional deltas\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": false 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "# TODO add features for normalized by speaker values of left, right, x, y\n", 282 | "# Name these 'norm-rx', 'norm-ry', 'norm-lx', and 'norm-ly'\n", 283 | "# using Z-score scaling (X-Xmean)/Xstd\n", 284 | "\n", 285 | "features_norm = ['norm-rx', 'norm-ry', 'norm-lx', 'norm-ly']" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "# TODO add features for polar coordinate values where the nose is the origin\n", 297 | "# Name these 'polar-rr', 'polar-rtheta', 'polar-lr', and 'polar-ltheta'\n", 298 | "# Note that 'polar-rr' and 'polar-rtheta' refer to the radius and angle\n", 299 | "\n", 300 | "features_polar = ['polar-rr', 'polar-rtheta', 'polar-lr', 'polar-ltheta']" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "# TODO add features for left, right, x, y differences by one time step, i.e. the \"delta\" values discussed in the lecture\n", 312 | "# Name these 'delta-rx', 'delta-ry', 'delta-lx', and 'delta-ly'\n", 313 | "\n", 314 | "features_delta = ['delta-rx', 'delta-ry', 'delta-lx', 'delta-ly']" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "# TODO add features of your own design, which may be a combination of the above or something else\n", 326 | "# Name these whatever you would like\n", 327 | "\n", 328 | "# TODO define a list named 'features_custom' for building the training set" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "**Question 1:** What custom features did you choose for the features_custom set and why?\n", 336 | "\n", 337 | "**Answer 1:**" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "\n", 345 | "### Features Unit Testing\n", 346 | "Run the following unit tests as a sanity check on the defined \"ground\", \"norm\", \"polar\", and \"delta\"\n", 347 | "feature sets. The test simply looks for some valid values but is not exhaustive. However, the project should not be submitted if these tests don't pass. 
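For reference, a hedged sketch of one possible shape for the norm, polar, and delta TODO cells above. It assumes the `df_means` and `df_std` speaker-groupby dataframes from the tutorial cells; your implementation may reasonably differ:

```python
# Sketch only, assuming df_means = asl.df.groupby('speaker').mean()
# and df_std = asl.df.groupby('speaker').std() as in the tutorial above.
for hand, h in (('right', 'r'), ('left', 'l')):
    for c in ('x', 'y'):
        raw = '{}-{}'.format(hand, c)  # e.g. 'right-x'
        mean = asl.df['speaker'].map(df_means[raw])
        std = asl.df['speaker'].map(df_std[raw])
        # standard score (X - Xmean) / Xstd, matched per speaker
        asl.df['norm-{}{}'.format(h, c)] = (asl.df[raw] - mean) / std
        # frame-to-frame delta (a per-video groupby diff is arguably cleaner)
        asl.df['delta-{}{}'.format(h, c)] = asl.df[raw].diff().fillna(0)

# Polar coordinates relative to the nose (the "ground" features), with x and y
# swapped in arctan2 to move the angle discontinuity above the speaker's head.
asl.df['polar-rr'] = np.hypot(asl.df['grnd-rx'], asl.df['grnd-ry'])
asl.df['polar-rtheta'] = np.arctan2(asl.df['grnd-rx'], asl.df['grnd-ry'])
asl.df['polar-lr'] = np.hypot(asl.df['grnd-lx'], asl.df['grnd-ly'])
asl.df['polar-ltheta'] = np.arctan2(asl.df['grnd-lx'], asl.df['grnd-ly'])
```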
348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "import unittest\n", 359 | "# import numpy as np\n", 360 | "\n", 361 | "class TestFeatures(unittest.TestCase):\n", 362 | "\n", 363 | " def test_features_ground(self):\n", 364 | " sample = (asl.df.ix[98, 1][features_ground]).tolist()\n", 365 | " self.assertEqual(sample, [9, 113, -12, 119])\n", 366 | "\n", 367 | " def test_features_norm(self):\n", 368 | " sample = (asl.df.ix[98, 1][features_norm]).tolist()\n", 369 | " np.testing.assert_almost_equal(sample, [ 1.153, 1.663, -0.891, 0.742], 3)\n", 370 | "\n", 371 | " def test_features_polar(self):\n", 372 | " sample = (asl.df.ix[98,1][features_polar]).tolist()\n", 373 | " np.testing.assert_almost_equal(sample, [113.3578, 0.0794, 119.603, -0.1005], 3)\n", 374 | "\n", 375 | " def test_features_delta(self):\n", 376 | " sample = (asl.df.ix[98, 0][features_delta]).tolist()\n", 377 | " self.assertEqual(sample, [0, 0, 0, 0])\n", 378 | " sample = (asl.df.ix[98, 18][features_delta]).tolist()\n", 379 | " self.assertTrue(sample in [[-16, -5, -2, 4], [-14, -9, 0, 0]], \"Sample value found was {}\".format(sample))\n", 380 | " \n", 381 | "suite = unittest.TestLoader().loadTestsFromModule(TestFeatures())\n", 382 | "unittest.TextTestRunner().run(suite)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "source": [ 391 | "\n", 392 | "## PART 2: Model Selection\n", 393 | "### Model Selection Tutorial\n", 394 | "The objective of Model Selection is to tune the number of states for each word HMM prior to testing on unseen data. In this section you will explore three methods: \n", 395 | "- Log likelihood using cross-validation folds (CV)\n", 396 | "- Bayesian Information Criterion (BIC)\n", 397 | "- Discriminative Information Criterion (DIC) " 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "##### Train a single word\n", 405 | "Now that we have built a training set with sequence data, we can \"train\" models for each word. As a simple starting example, we train a single word using Gaussian hidden Markov models (HMM). By using the `fit` method during training, the [Baum-Welch Expectation-Maximization](https://en.wikipedia.org/wiki/Baum%E2%80%93Welch_algorithm) (EM) algorithm is invoked iteratively to find the best estimate for the model *for the number of hidden states specified* from a group of sample sequences. For this example, we *assume* the correct number of hidden states is 3, but that is just a guess. How do we know what the \"best\" number of states for training is? We will need to find some model selection technique to choose the best parameter. 
406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": { 412 | "collapsed": false 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "import warnings\n", 417 | "from hmmlearn.hmm import GaussianHMM\n", 418 | "\n", 419 | "def train_a_word(word, num_hidden_states, features):\n", 420 | " \n", 421 | " warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n", 422 | " training = asl.build_training(features) \n", 423 | " X, lengths = training.get_word_Xlengths(word)\n", 424 | " model = GaussianHMM(n_components=num_hidden_states, n_iter=1000).fit(X, lengths)\n", 425 | " logL = model.score(X, lengths)\n", 426 | " return model, logL\n", 427 | "\n", 428 | "demoword = 'BOOK'\n", 429 | "model, logL = train_a_word(demoword, 3, features_ground)\n", 430 | "print(\"Number of states trained in model for {} is {}\".format(demoword, model.n_components))\n", 431 | "print(\"logL = {}\".format(logL))" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "The HMM model has been trained and information can be pulled from the model, including means and variances for each feature and hidden state. The [log likelihood](http://math.stackexchange.com/questions/892832/why-we-consider-log-likelihood-instead-of-likelihood-in-gaussian-distribution) for any individual sample or group of samples can also be calculated with the `score` method." 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": false 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "def show_model_stats(word, model):\n", 450 | " print(\"Number of states trained in model for {} is {}\".format(word, model.n_components)) \n", 451 | " variance=np.array([np.diag(model.covars_[i]) for i in range(model.n_components)]) \n", 452 | " for i in range(model.n_components): # for each hidden state\n", 453 | " print(\"hidden state #{}\".format(i))\n", 454 | " print(\"mean = \", model.means_[i])\n", 455 | " print(\"variance = \", variance[i])\n", 456 | " print()\n", 457 | " \n", 458 | "show_model_stats(demoword, model)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "##### Try it!\n", 466 | "Experiment by changing the feature set, word, and/or num_hidden_states values in the next cell to see changes in values. " 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": false 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "my_testword = 'CHOCOLATE'\n", 478 | "model, logL = train_a_word(my_testword, 3, features_ground) # Experiment here with different parameters\n", 479 | "show_model_stats(my_testword, model)\n", 480 | "print(\"logL = {}\".format(logL))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "##### Visualize the hidden states\n", 488 | "We can plot the means and variances for each state and feature. Try varying the number of states trained for the HMM model and examine the variances. Are there some models that are \"better\" than others? How can you tell? We would like to hear what you think in the classroom online." 
489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": false 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "%matplotlib inline" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": { 506 | "collapsed": false 507 | }, 508 | "outputs": [], 509 | "source": [ 510 | "import math\n", 511 | "from matplotlib import (cm, pyplot as plt, mlab)\n", 512 | "\n", 513 | "def visualize(word, model):\n", 514 | " \"\"\" visualize the input model for a particular word \"\"\"\n", 515 | " variance=np.array([np.diag(model.covars_[i]) for i in range(model.n_components)])\n", 516 | " figures = []\n", 517 | " for parm_idx in range(len(model.means_[0])):\n", 518 | " xmin = int(min(model.means_[:,parm_idx]) - max(variance[:,parm_idx]))\n", 519 | " xmax = int(max(model.means_[:,parm_idx]) + max(variance[:,parm_idx]))\n", 520 | " fig, axs = plt.subplots(model.n_components, sharex=True, sharey=False)\n", 521 | " colours = cm.rainbow(np.linspace(0, 1, model.n_components))\n", 522 | " for i, (ax, colour) in enumerate(zip(axs, colours)):\n", 523 | " x = np.linspace(xmin, xmax, 100)\n", 524 | " mu = model.means_[i,parm_idx]\n", 525 | " sigma = math.sqrt(np.diag(model.covars_[i])[parm_idx])\n", 526 | " ax.plot(x, mlab.normpdf(x, mu, sigma), c=colour)\n", 527 | " ax.set_title(\"{} feature {} hidden state #{}\".format(word, parm_idx, i))\n", 528 | "\n", 529 | " ax.grid(True)\n", 530 | " figures.append(plt)\n", 531 | " for p in figures:\n", 532 | " p.show()\n", 533 | " \n", 534 | "visualize(my_testword, model)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "##### ModelSelector class\n", 542 | "Review the `ModelSelector` class from the codebase found in the `my_model_selectors.py` module. It is designed as a strategy pattern for choosing different model selectors. For the project submission in this section, subclass `ModelSelector` to implement the following model selectors. In other words, you will write your own classes/functions in the `my_model_selectors.py` module and run them from this notebook:\n", 543 | "\n", 544 | "- `SelectorCV`: Log likelihood with CV\n", 545 | "- `SelectorBIC`: BIC \n", 546 | "- `SelectorDIC`: DIC\n", 547 | "\n", 548 | "You will train each word in the training set with a range of values for the number of hidden states, and then score these alternatives with the model selector, choosing the \"best\" according to each strategy. 
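A hedged skeleton of such a subclass is sketched below. It assumes the provided base class exposes a `base_model(num_states)` helper plus `self.X`, `self.lengths`, `self.min_n_components`, and `self.max_n_components`, as in the project template; the scoring line is a placeholder to replace with your CV, BIC, or DIC criterion:

```python
# Hedged skeleton: a selector subclass typically overrides select(), trains a
# candidate model for each state count, and keeps the best-scoring one.
class SelectorExample(ModelSelector):  # illustrative name only
    def select(self):
        best_score, best_model = float('-inf'), None
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = self.base_model(n)  # fit a GaussianHMM with n states
                score = model.score(self.X, self.lengths)  # placeholder scoring
            except Exception:
                continue  # skip state counts hmmlearn cannot fit or score
            if score > best_score:
                best_score, best_model = score, model
        return best_model
```

Note that for a criterion like BIC, where lower scores are better, the comparison direction flips.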
The simple case of training with a constant value for `n_components` can be called using the provided `SelectorConstant` subclass as follows:" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": { 555 | "collapsed": false 556 | }, 557 | "outputs": [], 558 | "source": [ 559 | "from my_model_selectors import SelectorConstant\n", 560 | "\n", 561 | "training = asl.build_training(features_ground) # Experiment here with different feature sets defined in part 1\n", 562 | "word = 'VEGETABLE' # Experiment here with different words\n", 563 | "model = SelectorConstant(training.get_all_sequences(), training.get_all_Xlengths(), word, n_constant=3).select()\n", 564 | "print(\"Number of states trained in model for {} is {}\".format(word, model.n_components))" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "##### Cross-validation folds\n", 572 | "If we simply score the model with the Log Likelihood calculated from the feature sequences it has been trained on, we should expect that more complex models will have higher likelihoods. However, that doesn't tell us which would have a better likelihood score on unseen data. The model will likely be overfit as complexity is added. To estimate which model topology is better using only the training data, we can compare scores using cross-validation. One technique for cross-validation is to break the training set into \"folds\" and rotate which fold is left out of training. The \"left out\" fold is then scored. This gives us a proxy method of finding the best model to use on \"unseen data\". In the following example, a set of word sequences is broken into three folds using the [scikit-learn KFold](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html) class object. When you implement `SelectorCV`, you will use this technique." 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": null, 578 | "metadata": { 579 | "collapsed": false 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "from sklearn.model_selection import KFold\n", 584 | "\n", 585 | "training = asl.build_training(features_ground) # Experiment here with different feature sets\n", 586 | "word = 'VEGETABLE' # Experiment here with different words\n", 587 | "word_sequences = training.get_word_sequences(word)\n", 588 | "split_method = KFold()\n", 589 | "for cv_train_idx, cv_test_idx in split_method.split(word_sequences):\n", 590 | " print(\"Train fold indices:{} Test fold indices:{}\".format(cv_train_idx, cv_test_idx)) # view indices of the folds" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "**Tip:** In order to run `hmmlearn` training using the X,lengths tuples on the new folds, subsets must be combined based on the indices given for the folds. A helper utility has been provided in the `asl_utils` module named `combine_sequences` for this purpose." 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "##### Scoring models with other criteria\n", 605 | "Scoring model topologies with **BIC** balances fit and complexity within the training set for each word. In the BIC equation, a penalty term penalizes complexity to avoid overfitting, so that it is not necessary to also use cross-validation in the selection process. There are a number of references on the internet for this criterion. 
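A common form is $BIC = -2 \log L + p \log N$, where $L$ is the likelihood of the fitted model, $p$ is the number of free parameters, and $N$ is the number of data points; lower scores are better. A hedged sketch for a diagonal-covariance `GaussianHMM` follows (the free-parameter count here is an assumption to verify against your chosen reference):

```python
import numpy as np

def bic_score(model, X, lengths):
    """Sketch: BIC = -2 * logL + p * log(N), lower is better.
    For an n-state GaussianHMM over d features with diagonal covariance,
    p = (n - 1) starting probs + n * (n - 1) transition probs
      + n * d means + n * d variances = n**2 + 2 * n * d - 1."""
    logL = model.score(X, lengths)
    n, d = model.n_components, X.shape[1]
    p = n ** 2 + 2 * n * d - 1
    return -2.0 * logL + p * np.log(len(X))
```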
These [slides](http://www2.imm.dtu.dk/courses/02433/doc/ch6_slides.pdf) include a formula you may find helpful for your implementation.\n", 606 | "\n", 607 | "The advantages of scoring model topologies with **DIC** over BIC are presented by Alain Biem in this [reference](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.58.6208&rep=rep1&type=pdf) (also found [here](https://pdfs.semanticscholar.org/ed3d/7c4a5f607201f3848d4c02dd9ba17c791fc2.pdf)). DIC scores the discriminant ability of a training set for one word against competing words. Instead of a penalty term for complexity, it provides a penalty if model likelihoods for non-matching words are too similar to model likelihoods for the correct word in the word set." 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": {}, 613 | "source": [ 614 | "\n", 615 | "### Model Selection Implementation Submission\n", 616 | "Implement `SelectorCV`, `SelectorBIC`, and `SelectorDIC` classes in the `my_model_selectors.py` module. Run the selectors on the following five words. Then answer the questions about your results.\n", 617 | "\n", 618 | "**Tip:** The `hmmlearn` library may not be able to train or score all models. Implement try/except constructs as necessary to eliminate non-viable models from consideration." 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "metadata": { 625 | "collapsed": false 626 | }, 627 | "outputs": [], 628 | "source": [ 629 | "words_to_train = ['FISH', 'BOOK', 'VEGETABLE', 'FUTURE', 'JOHN']\n", 630 | "import timeit" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": { 637 | "collapsed": false 638 | }, 639 | "outputs": [], 640 | "source": [ 641 | "# TODO: Implement SelectorCV in my_model_selectors.py\n", 642 | "from my_model_selectors import SelectorCV\n", 643 | "\n", 644 | "training = asl.build_training(features_ground) # Experiment here with different feature sets defined in part 1\n", 645 | "sequences = training.get_all_sequences()\n", 646 | "Xlengths = training.get_all_Xlengths()\n", 647 | "for word in words_to_train:\n", 648 | " start = timeit.default_timer()\n", 649 | " model = SelectorCV(sequences, Xlengths, word, \n", 650 | " min_n_components=2, max_n_components=15, random_state = 14).select()\n", 651 | " end = timeit.default_timer()-start\n", 652 | " if model is not None:\n", 653 | " print(\"Training complete for {} with {} states with time {} seconds\".format(word, model.n_components, end))\n", 654 | " else:\n", 655 | " print(\"Training failed for {}\".format(word))" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": { 662 | "collapsed": false 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "# TODO: Implement SelectorBIC in module my_model_selectors.py\n", 667 | "from my_model_selectors import SelectorBIC\n", 668 | "\n", 669 | "training = asl.build_training(features_ground) # Experiment here with different feature sets defined in part 1\n", 670 | "sequences = training.get_all_sequences()\n", 671 | "Xlengths = training.get_all_Xlengths()\n", 672 | "for word in words_to_train:\n", 673 | " start = timeit.default_timer()\n", 674 | " model = SelectorBIC(sequences, Xlengths, word, \n", 675 | " min_n_components=2, max_n_components=15, random_state = 14).select()\n", 676 | " end = timeit.default_timer()-start\n", 677 | " if model is not None:\n", 678 | " print(\"Training complete for {} with {} states with time {} seconds\".format(word, model.n_components, 
end))\n", 679 | " else:\n", 680 | " print(\"Training failed for {}\".format(word))" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": { 687 | "collapsed": false 688 | }, 689 | "outputs": [], 690 | "source": [ 691 | "# TODO: Implement SelectorDIC in module my_model_selectors.py\n", 692 | "from my_model_selectors import SelectorDIC\n", 693 | "\n", 694 | "training = asl.build_training(features_ground) # Experiment here with different feature sets defined in part 1\n", 695 | "sequences = training.get_all_sequences()\n", 696 | "Xlengths = training.get_all_Xlengths()\n", 697 | "for word in words_to_train:\n", 698 | " start = timeit.default_timer()\n", 699 | " model = SelectorDIC(sequences, Xlengths, word, \n", 700 | " min_n_components=2, max_n_components=15, random_state = 14).select()\n", 701 | " end = timeit.default_timer()-start\n", 702 | " if model is not None:\n", 703 | " print(\"Training complete for {} with {} states with time {} seconds\".format(word, model.n_components, end))\n", 704 | " else:\n", 705 | " print(\"Training failed for {}\".format(word))" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "**Question 2:** Compare and contrast the possible advantages and disadvantages of the various model selectors implemented.\n", 713 | "\n", 714 | "**Answer 2:**" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "metadata": {}, 720 | "source": [ 721 | "\n", 722 | "### Model Selector Unit Testing\n", 723 | "Run the following unit tests as a sanity check on the implemented model selectors. The test simply looks for valid interfaces but is not exhaustive. However, the project should not be submitted if these tests don't pass." 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": { 730 | "collapsed": false 731 | }, 732 | "outputs": [], 733 | "source": [ 734 | "from asl_test_model_selectors import TestSelectors\n", 735 | "suite = unittest.TestLoader().loadTestsFromModule(TestSelectors())\n", 736 | "unittest.TextTestRunner().run(suite)" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "metadata": { 742 | "collapsed": false 743 | }, 744 | "source": [ 745 | "\n", 746 | "## PART 3: Recognizer\n", 747 | "The objective of this section is to \"put it all together\". Using the four feature sets created and the three model selectors, you will experiment with the models and present your results. Instead of training only five specific words as in the previous section, train the entire set with a feature set and model selector strategy. \n", 748 | "### Recognizer Tutorial\n", 749 | "##### Train the full training set\n", 750 | "The following example trains the entire set with the example `features_ground` feature set and the `SelectorConstant` model selector. 
Use this pattern for your experimentation and final submission cells.\n", 751 | "\n" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": { 758 | "collapsed": false 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "# autoreload for automatically reloading changes made in my_model_selectors and my_recognizer\n", 763 | "%load_ext autoreload\n", 764 | "%autoreload 2\n", 765 | "\n", 766 | "from my_model_selectors import SelectorConstant\n", 767 | "\n", 768 | "def train_all_words(features, model_selector):\n", 769 | " training = asl.build_training(features) # Experiment here with different feature sets defined in part 1\n", 770 | " sequences = training.get_all_sequences()\n", 771 | " Xlengths = training.get_all_Xlengths()\n", 772 | " model_dict = {}\n", 773 | " for word in training.words:\n", 774 | " model = model_selector(sequences, Xlengths, word, \n", 775 | " n_constant=3).select()\n", 776 | " model_dict[word]=model\n", 777 | " return model_dict\n", 778 | "\n", 779 | "models = train_all_words(features_ground, SelectorConstant)\n", 780 | "print(\"Number of word models returned = {}\".format(len(models)))" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "##### Load the test set\n", 788 | "The `build_test` method in `AslDb` is similar to the `build_training` method already presented, but there are a few differences:\n", 789 | "- the object is type `SinglesData` \n", 790 | "- the internal dictionary keys are the index of the test word rather than the word itself\n", 791 | "- the getter methods are `get_all_sequences`, `get_all_Xlengths`, `get_item_sequences` and `get_item_Xlengths`" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": { 798 | "collapsed": false 799 | }, 800 | "outputs": [], 801 | "source": [ 802 | "test_set = asl.build_test(features_ground)\n", 803 | "print(\"Number of test set items: {}\".format(test_set.num_items))\n", 804 | "print(\"Number of test set sentences: {}\".format(len(test_set.sentences_index)))" 805 | ] 806 | }, 807 | { 808 | "cell_type": "markdown", 809 | "metadata": {}, 810 | "source": [ 811 | "\n", 812 | "### Recognizer Implementation Submission\n", 813 | "For the final project submission, students must implement a recognizer following guidance in the `my_recognizer.py` module. Experiment with the four feature sets and the three model selection methods (that's 12 possible combinations). You can add and remove cells for experimentation or run the recognizers locally in some other way during your experiments, but retain the results for your discussion. For submission, you will provide code cells of **only three** interesting combinations for your discussion (see questions below). At least one of these should produce a word error rate of less than 60%, i.e., WER < 0.60.\n", 814 | "\n", 815 | "**Tip:** The `hmmlearn` library may not be able to train or score all models. Implement try/except constructs as necessary to eliminate non-viable models from consideration. 
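As a starting point, here is a hedged sketch of the scoring loop, one possible shape for the recognizer using the `SinglesData` getters listed above (`recognize_sketch` is an illustrative name, not the required interface):

```python
# Sketch: score each test item against every trained word model and keep
# the best-scoring word as the guess for that item.
def recognize_sketch(models, test_set):
    probabilities, guesses = [], []
    for word_id in range(test_set.num_items):
        X, lengths = test_set.get_item_Xlengths(word_id)
        scores = {}
        for word, model in models.items():
            try:
                scores[word] = model.score(X, lengths)
            except Exception:
                scores[word] = float('-inf')  # model could not score this item
        probabilities.append(scores)
        guesses.append(max(scores, key=scores.get))
    return probabilities, guesses
```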
816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": null, 821 | "metadata": { 822 | "collapsed": false 823 | }, 824 | "outputs": [], 825 | "source": [ 826 | "# TODO implement the recognize method in my_recognizer\n", 827 | "from my_recognizer import recognize\n", 828 | "from asl_utils import show_errors" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": false 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "# TODO Choose a feature set and model selector\n", 840 | "features = features_ground # change as needed\n", 841 | "model_selector = SelectorConstant # change as needed\n", 842 | "\n", 843 | "# TODO Recognize the test set and display the result with the show_errors method\n", 844 | "models = train_all_words(features, model_selector)\n", 845 | "test_set = asl.build_test(features)\n", 846 | "probabilities, guesses = recognize(models, test_set)\n", 847 | "show_errors(guesses, test_set)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": { 854 | "collapsed": false 855 | }, 856 | "outputs": [], 857 | "source": [ 858 | "# TODO Choose a feature set and model selector\n", 859 | "# TODO Recognize the test set and display the result with the show_errors method" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": { 866 | "collapsed": false 867 | }, 868 | "outputs": [], 869 | "source": [ 870 | "# TODO Choose a feature set and model selector\n", 871 | "# TODO Recognize the test set and display the result with the show_errors method" 872 | ] 873 | }, 874 | { 875 | "cell_type": "markdown", 876 | "metadata": {}, 877 | "source": [ 878 | "**Question 3:** Summarize the error results from three combinations of features and model selectors. What was the \"best\" combination and why? What additional information might we use to improve our WER? For more insight on improving WER, take a look at the introduction to Part 4.\n", 879 | "\n", 880 | "**Answer 3:**" 881 | ] 882 | }, 883 | { 884 | "cell_type": "markdown", 885 | "metadata": {}, 886 | "source": [ 887 | "\n", 888 | "### Recognizer Unit Tests\n", 889 | "Run the following unit tests as a sanity check on the defined recognizer. The test simply looks for some valid values but is not exhaustive. However, the project should not be submitted if these tests don't pass." 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": null, 895 | "metadata": { 896 | "collapsed": false 897 | }, 898 | "outputs": [], 899 | "source": [ 900 | "from asl_test_recognizer import TestRecognize\n", 901 | "suite = unittest.TestLoader().loadTestsFromModule(TestRecognize())\n", 902 | "unittest.TextTestRunner().run(suite)" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "\n", 910 | "## PART 4: (OPTIONAL) Improve the WER with Language Models\n", 911 | "We've squeezed just about as much as we can out of the model and still only get about 50% of the words right! Surely we can do better than that. Probability to the rescue again in the form of [statistical language models (SLM)](https://en.wikipedia.org/wiki/Language_model). The basic idea is that each word has some probability of occurrence within the set, and some probability that it is adjacent to specific other words. 
We can use that additional information to make better choices.\n", 912 | "\n", 913 | "##### Additional reading and resources\n", 914 | "- [Introduction to N-grams (Stanford Jurafsky slides)](https://web.stanford.edu/class/cs124/lec/languagemodeling.pdf)\n", 915 | "- [Speech Recognition Techniques for a Sign Language Recognition System, Philippe Dreuw et al](https://www-i6.informatik.rwth-aachen.de/publications/download/154/Dreuw--2007.pdf) see the improved results of applying LM on *this* data!\n", 916 | "- [SLM data for *this* ASL dataset](ftp://wasserstoff.informatik.rwth-aachen.de/pub/rwth-boston-104/lm/)\n", 917 | "\n", 918 | "##### Optional challenge\n", 919 | "The recognizer you implemented in Part 3 is equivalent to a \"0-gram\" SLM. Improve the WER with the SLM data provided with the data set in the link above using \"1-gram\", \"2-gram\", and/or \"3-gram\" statistics. The `probabilities` data you've already calculated will be useful and can be turned into a pandas DataFrame if desired (see next cell). \n", 920 | "Good luck! Share your results with the class!" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "metadata": { 927 | "collapsed": true 928 | }, 929 | "outputs": [], 930 | "source": [ 931 | "# create a DataFrame of log likelihoods for the test word items\n", 932 | "df_probs = pd.DataFrame(data=probabilities)\n", 933 | "df_probs.head()" 934 | ] 935 | } 936 | ], 937 | "metadata": { 938 | "anaconda-cloud": {}, 939 | "kernelspec": { 940 | "display_name": "Python [conda env:aind]", 941 | "language": "python", 942 | "name": "conda-env-aind-py" 943 | }, 944 | "language_info": { 945 | "codemirror_mode": { 946 | "name": "ipython", 947 | "version": 3 948 | }, 949 | "file_extension": ".py", 950 | "mimetype": "text/x-python", 951 | "name": "python", 952 | "nbconvert_exporter": "python", 953 | "pygments_lexer": "ipython3", 954 | "version": "3.5.2" 955 | }, 956 | "nbpresent": { 957 | "slides": { 958 | "0a2d4faf-9fb8-4cee-853b-ed68b90f3f8a": { 959 | "id": "0a2d4faf-9fb8-4cee-853b-ed68b90f3f8a", 960 | "prev": null, 961 | "regions": { 962 | "3fb9ce83-fbb2-4995-832a-f8f400734ad3": { 963 | "attrs": { 964 | "height": 0.8, 965 | "width": 0.8, 966 | "x": 0.1, 967 | "y": 0.1 968 | }, 969 | "content": { 970 | "cell": "1dbb9346-179b-4835-b430-6369d88f1a1b", 971 | "part": "whole" 972 | }, 973 | "id": "3fb9ce83-fbb2-4995-832a-f8f400734ad3" 974 | } 975 | } 976 | }, 977 | "1519a4fa-1588-4644-98de-9c43bf0aceb5": { 978 | "id": "1519a4fa-1588-4644-98de-9c43bf0aceb5", 979 | "prev": "8a712017-49b7-449f-8264-43a032ace902", 980 | "regions": { 981 | "29546121-ed11-44b7-8144-0c44e874098f": { 982 | "attrs": { 983 | "height": 0.8, 984 | "width": 0.8, 985 | "x": 0.1, 986 | "y": 0.1 987 | }, 988 | "content": { 989 | "cell": "365590a4-6963-4812-a1cf-688f7b6bb9ff", 990 | "part": "whole" 991 | }, 992 | "id": "29546121-ed11-44b7-8144-0c44e874098f" 993 | } 994 | } 995 | }, 996 | "176eaccb-15dd-455d-bf07-504213e7aa01": { 997 | "id": "176eaccb-15dd-455d-bf07-504213e7aa01", 998 | "prev": "de6b30f4-2463-4901-92ed-aabad78e5e0f", 999 | "regions": { 1000 | "1542aa9e-dc55-4b90-adef-bf5181872b42": { 1001 | "attrs": { 1002 | "height": 0.8, 1003 | "width": 0.8, 1004 | "x": 0.1, 1005 | "y": 0.1 1006 | }, 1007 | "content": { 1008 | "cell": "5c242050-c1f7-4b3b-8103-2ea9d71a40dc", 1009 | "part": "whole" 1010 | }, 1011 | "id": "1542aa9e-dc55-4b90-adef-bf5181872b42" 1012 | } 1013 | } 1014 | }, 1015 | "19091b36-b0e7-49b1-b501-ec05937e0da9": { 1016 | "id": "19091b36-b0e7-49b1-b501-ec05937e0da9", 
1017 | "prev": "1983c02e-fb99-4c05-a728-e0c0ad7c06d8", 1018 | "regions": { 1019 | "6529a31c-8d45-425c-b1d7-d0ac6fca6a32": { 1020 | "attrs": { 1021 | "height": 0.8, 1022 | "width": 0.8, 1023 | "x": 0.1, 1024 | "y": 0.1 1025 | }, 1026 | "content": { 1027 | "cell": "e766909d-9421-4aaf-9fb1-bc90d27e49e3", 1028 | "part": "whole" 1029 | }, 1030 | "id": "6529a31c-8d45-425c-b1d7-d0ac6fca6a32" 1031 | } 1032 | } 1033 | }, 1034 | "1983c02e-fb99-4c05-a728-e0c0ad7c06d8": { 1035 | "id": "1983c02e-fb99-4c05-a728-e0c0ad7c06d8", 1036 | "prev": "176eaccb-15dd-455d-bf07-504213e7aa01", 1037 | "regions": { 1038 | "1c4e605d-7f22-4f30-b3fb-74b2937e7a4a": { 1039 | "attrs": { 1040 | "height": 0.8, 1041 | "width": 0.8, 1042 | "x": 0.1, 1043 | "y": 0.1 1044 | }, 1045 | "content": { 1046 | "cell": "4d217204-e5c0-4568-bd30-12c2e41b681d", 1047 | "part": "whole" 1048 | }, 1049 | "id": "1c4e605d-7f22-4f30-b3fb-74b2937e7a4a" 1050 | } 1051 | } 1052 | }, 1053 | "212b111f-4527-459c-8297-1db5580ee5c9": { 1054 | "id": "212b111f-4527-459c-8297-1db5580ee5c9", 1055 | "prev": "76898529-e49e-4663-8d02-8261dfe1d94b", 1056 | "regions": { 1057 | "2e4bd280-3cd6-47d0-9c81-17737b24053b": { 1058 | "attrs": { 1059 | "height": 0.8, 1060 | "width": 0.8, 1061 | "x": 0.1, 1062 | "y": 0.1 1063 | }, 1064 | "content": { 1065 | "cell": "0c316996-9933-4b3d-82ec-259518dc8bc9", 1066 | "part": "whole" 1067 | }, 1068 | "id": "2e4bd280-3cd6-47d0-9c81-17737b24053b" 1069 | } 1070 | } 1071 | }, 1072 | "23a7337f-a0cf-4ed4-baa9-ec06bfdc0579": { 1073 | "id": "23a7337f-a0cf-4ed4-baa9-ec06bfdc0579", 1074 | "prev": "e76e9a02-54c1-4ec9-80fb-c611ed398122", 1075 | "regions": { 1076 | "b5721d20-d6f8-4ddb-a5aa-eb16f0cc8893": { 1077 | "attrs": { 1078 | "height": 0.8, 1079 | "width": 0.8, 1080 | "x": 0.1, 1081 | "y": 0.1 1082 | }, 1083 | "content": { 1084 | "cell": "313015a2-b5a9-4136-a8ea-5d011e47d840", 1085 | "part": "whole" 1086 | }, 1087 | "id": "b5721d20-d6f8-4ddb-a5aa-eb16f0cc8893" 1088 | } 1089 | } 1090 | }, 1091 | "732f1952-ee54-46fb-8067-099512824296": { 1092 | "id": "732f1952-ee54-46fb-8067-099512824296", 1093 | "prev": "0a2d4faf-9fb8-4cee-853b-ed68b90f3f8a", 1094 | "regions": { 1095 | "f31d4597-08ad-4c46-ad52-4bd2d775c624": { 1096 | "attrs": { 1097 | "height": 0.8, 1098 | "width": 0.8, 1099 | "x": 0.1, 1100 | "y": 0.1 1101 | }, 1102 | "content": { 1103 | "cell": "aadfec52-27ca-4541-8920-fa9253d51827", 1104 | "part": "whole" 1105 | }, 1106 | "id": "f31d4597-08ad-4c46-ad52-4bd2d775c624" 1107 | } 1108 | } 1109 | }, 1110 | "76898529-e49e-4663-8d02-8261dfe1d94b": { 1111 | "id": "76898529-e49e-4663-8d02-8261dfe1d94b", 1112 | "prev": "19091b36-b0e7-49b1-b501-ec05937e0da9", 1113 | "regions": { 1114 | "ec1746fc-aec9-4a7c-8225-9e9ac8d45889": { 1115 | "attrs": { 1116 | "height": 0.8, 1117 | "width": 0.8, 1118 | "x": 0.1, 1119 | "y": 0.1 1120 | }, 1121 | "content": { 1122 | "cell": "b3e539be-84e2-49ce-a183-31cfc5c7ce7c", 1123 | "part": "whole" 1124 | }, 1125 | "id": "ec1746fc-aec9-4a7c-8225-9e9ac8d45889" 1126 | } 1127 | } 1128 | }, 1129 | "8a712017-49b7-449f-8264-43a032ace902": { 1130 | "id": "8a712017-49b7-449f-8264-43a032ace902", 1131 | "prev": "bed9e696-630e-4747-be1c-bc3737ba992f", 1132 | "regions": { 1133 | "1faab517-cd16-4c63-bb01-a67246749d7a": { 1134 | "attrs": { 1135 | "height": 0.8, 1136 | "width": 0.8, 1137 | "x": 0.1, 1138 | "y": 0.1 1139 | }, 1140 | "content": { 1141 | "cell": "3f14ddf0-4145-4687-9c33-712c3c32520f", 1142 | "part": "whole" 1143 | }, 1144 | "id": "1faab517-cd16-4c63-bb01-a67246749d7a" 1145 | } 1146 | } 1147 | }, 1148 | 
"90af992d-eb6d-4496-b2d2-6aa9a95b6a61": { 1149 | "id": "90af992d-eb6d-4496-b2d2-6aa9a95b6a61", 1150 | "prev": "732f1952-ee54-46fb-8067-099512824296", 1151 | "regions": { 1152 | "4f448bec-5be9-4553-88ae-e35ed7612f25": { 1153 | "attrs": { 1154 | "height": 0.8, 1155 | "width": 0.8, 1156 | "x": 0.1, 1157 | "y": 0.1 1158 | }, 1159 | "content": { 1160 | "cell": "c445fbfb-b8ab-4e9a-8d13-12231a1c588f", 1161 | "part": "whole" 1162 | }, 1163 | "id": "4f448bec-5be9-4553-88ae-e35ed7612f25" 1164 | } 1165 | } 1166 | }, 1167 | "bed9e696-630e-4747-be1c-bc3737ba992f": { 1168 | "id": "bed9e696-630e-4747-be1c-bc3737ba992f", 1169 | "prev": "23a7337f-a0cf-4ed4-baa9-ec06bfdc0579", 1170 | "regions": { 1171 | "ac1513f0-404f-492b-8b42-0313e9a753b0": { 1172 | "attrs": { 1173 | "height": 0.8, 1174 | "width": 0.8, 1175 | "x": 0.1, 1176 | "y": 0.1 1177 | }, 1178 | "content": { 1179 | "cell": "18dd2eee-8b6c-4a5e-9539-132d00a7c7e1", 1180 | "part": "whole" 1181 | }, 1182 | "id": "ac1513f0-404f-492b-8b42-0313e9a753b0" 1183 | } 1184 | } 1185 | }, 1186 | "de6b30f4-2463-4901-92ed-aabad78e5e0f": { 1187 | "id": "de6b30f4-2463-4901-92ed-aabad78e5e0f", 1188 | "prev": "e36b4639-be8c-46f7-a8c9-bcfb134f9fd0", 1189 | "regions": { 1190 | "55ec36e0-362f-4fd3-8060-7cee056039aa": { 1191 | "attrs": { 1192 | "height": 0.8, 1193 | "width": 0.8, 1194 | "x": 0.1, 1195 | "y": 0.1 1196 | }, 1197 | "content": { 1198 | "cell": "c3cf461e-4c9e-4dec-99d2-07bfa79cbe23", 1199 | "part": "whole" 1200 | }, 1201 | "id": "55ec36e0-362f-4fd3-8060-7cee056039aa" 1202 | } 1203 | } 1204 | }, 1205 | "e36b4639-be8c-46f7-a8c9-bcfb134f9fd0": { 1206 | "id": "e36b4639-be8c-46f7-a8c9-bcfb134f9fd0", 1207 | "prev": "1519a4fa-1588-4644-98de-9c43bf0aceb5", 1208 | "regions": { 1209 | "4c1e9714-9ba0-45fd-8a2f-ef80a5c85c2e": { 1210 | "attrs": { 1211 | "height": 0.8, 1212 | "width": 0.8, 1213 | "x": 0.1, 1214 | "y": 0.1 1215 | }, 1216 | "content": { 1217 | "cell": "6534d4dc-125f-47e6-a022-cf1e0d277174", 1218 | "part": "whole" 1219 | }, 1220 | "id": "4c1e9714-9ba0-45fd-8a2f-ef80a5c85c2e" 1221 | } 1222 | } 1223 | }, 1224 | "e76e9a02-54c1-4ec9-80fb-c611ed398122": { 1225 | "id": "e76e9a02-54c1-4ec9-80fb-c611ed398122", 1226 | "prev": "90af992d-eb6d-4496-b2d2-6aa9a95b6a61", 1227 | "regions": { 1228 | "9491b84d-193b-40ff-9321-d21eb1ba88d4": { 1229 | "attrs": { 1230 | "height": 0.8, 1231 | "width": 0.8, 1232 | "x": 0.1, 1233 | "y": 0.1 1234 | }, 1235 | "content": { 1236 | "cell": "b64ec10e-fa9d-4f3f-907f-6799611ed6b1", 1237 | "part": "whole" 1238 | }, 1239 | "id": "9491b84d-193b-40ff-9321-d21eb1ba88d4" 1240 | } 1241 | } 1242 | } 1243 | }, 1244 | "themes": {} 1245 | } 1246 | }, 1247 | "nbformat": 4, 1248 | "nbformat_minor": 0 1249 | } 1250 | --------------------------------------------------------------------------------