├── .gitignore ├── LICENSE ├── README.md ├── pytorch ├── nlp │ ├── README.md │ ├── build_kaggle_dataset.py │ ├── build_vocab.py │ ├── data │ │ ├── kaggle │ │ │ └── .gitkeep │ │ └── small │ │ │ ├── test │ │ │ ├── labels.txt │ │ │ └── sentences.txt │ │ │ ├── train │ │ │ ├── labels.txt │ │ │ └── sentences.txt │ │ │ └── val │ │ │ ├── labels.txt │ │ │ └── sentences.txt │ ├── evaluate.py │ ├── experiments │ │ ├── base_model │ │ │ └── params.json │ │ └── learning_rate │ │ │ └── params.json │ ├── model │ │ ├── __init__.py │ │ ├── data_loader.py │ │ └── net.py │ ├── requirements.txt │ ├── search_hyperparams.py │ ├── synthesize_results.py │ ├── train.py │ └── utils.py └── vision │ ├── README.md │ ├── build_dataset.py │ ├── data │ └── .gitkeep │ ├── evaluate.py │ ├── experiments │ ├── .gitkeep │ ├── base_model │ │ └── params.json │ └── learning_rate │ │ └── params.json │ ├── model │ ├── __init__.py │ ├── data_loader.py │ └── net.py │ ├── requirements.txt │ ├── search_hyperparams.py │ ├── synthesize_results.py │ ├── train.py │ └── utils.py └── tensorflow ├── nlp ├── README.md ├── build_kaggle_dataset.py ├── build_vocab.py ├── data │ ├── kaggle │ │ └── .gitkeep │ └── small │ │ ├── dev │ │ ├── labels.txt │ │ └── sentences.txt │ │ ├── test │ │ ├── labels.txt │ │ └── sentences.txt │ │ └── train │ │ ├── labels.txt │ │ └── sentences.txt ├── evaluate.py ├── experiments │ ├── .gitkeep │ ├── base_model │ │ └── params.json │ └── learning_rate │ │ └── params.json ├── model │ ├── __init__.py │ ├── evaluation.py │ ├── input_fn.py │ ├── model_fn.py │ ├── training.py │ └── utils.py ├── requirements.txt ├── search_hyperparams.py ├── synthesize_results.py └── train.py └── vision ├── README.md ├── build_dataset.py ├── data └── .gitkeep ├── evaluate.py ├── experiments ├── .gitkeep ├── base_model │ └── params.json └── learning_rate │ └── params.json ├── model ├── __init__.py ├── evaluation.py ├── input_fn.py ├── model_fn.py ├── training.py └── utils.py ├── requirements.txt ├── search_hyperparams.py ├── synthesize_results.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 CS230 Teaching team 4 | 5 | Teaching assistants contributors (Winter 2018): Guillaume Genthial, Olivier Moindrot, Surag Nair. 6 | Instructors: Kian Katanforoosh, Andrew Ng. 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS230 Code Examples 2 | 3 | [Tutorials](https://cs230-stanford.github.io) 4 | 5 | 6 | We are happy to introduce some code examples that you can use for your CS230 projects. The code contains examples for TensorFlow and PyTorch, in vision and NLP. The structure of the repository is the following: 7 | 8 | ``` 9 | README.md 10 | pytorch/ 11 | vision/ 12 | README.md 13 | nlp/ 14 | README.md 15 | tensorflow/ 16 | vision/ 17 | README.md 18 | nlp/ 19 | README.md 20 | ``` 21 | 22 | You'll find a README.md in each sub-directory. 
-------------------------------------------------------------------------------- /pytorch/nlp/README.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition with PyTorch 2 | 3 | _Authors: Surag Nair, Guillaume Genthial and Olivier Moindrot_ 4 | 5 | Take the time to read the [tutorials](https://cs230-stanford.github.io/project-starter-code.html). 6 | 7 | Note : all scripts must be run in `pytorch/nlp`. 8 | 9 | ## Requirements 10 | 11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html). 12 | 13 | ``` 14 | virtualenv -p python3 .env 15 | source .env/bin/activate 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | When you're done working on the project, deactivate the virtual environment with `deactivate`. 20 | 21 | ## Task 22 | 23 | Given a sentence, give a tag to each word ([Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition)) 24 | 25 | ``` 26 | John lives in New York 27 | B-PER O O B-LOC I-LOC 28 | ``` 29 | 30 | ## [optional] Download the Kaggle dataset (~5 min) 31 | 32 | We provide a small subset of the kaggle dataset (30 sentences) for testing in `data/small` but you are encouraged to download the original version on the [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) website. 33 | 34 | 1. **Download the dataset** `ner_dataset.csv` on [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) and save it under the `nlp/data/kaggle` directory. Make sure you download the simple version `ner_dataset.csv` and NOT the full version `ner.csv`. 35 | 36 | 2. **Build the dataset** Run the following script 37 | 38 | ``` 39 | python build_kaggle_dataset.py 40 | ``` 41 | 42 | It will extract the sentences and labels from the dataset, split it into train/val/test and save it in a convenient format for our model. 43 | 44 | _Debug_ If you get some errors, check that you downloaded the right file and saved it in the right directory. If you have issues with encoding, try running the script with python 2.7. 45 | 46 | 3. In the next section, change `data/small` by `data/kaggle` 47 | 48 | ## Quickstart (~10 min) 49 | 50 | 1. **Build** vocabularies and parameters for your dataset by running 51 | 52 | ``` 53 | python build_vocab.py --data_dir data/small 54 | ``` 55 | 56 | It will write vocabulary files `words.txt` and `tags.txt` containing the words and tags in the dataset. It will also save a `dataset_params.json` with some extra information. 57 | 58 | 2. **Your first experiment** We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the hyperparameters for the experiment. It looks like 59 | 60 | ```json 61 | { 62 | "learning_rate": 1e-3, 63 | "batch_size": 5, 64 | "num_epochs": 2 65 | } 66 | ``` 67 | 68 | For every new experiment, you will need to create a new directory under `experiments` with a `params.json` file. 69 | 70 | 3. **Train** your experiment. Simply run 71 | 72 | ``` 73 | python train.py --data_dir data/small --model_dir experiments/base_model 74 | ``` 75 | 76 | It will instantiate a model and train it on the training set following the hyperparameters specified in `params.json`. It will also evaluate some metrics on the development set. 77 | 78 | 4. **Your first hyperparameters search** We created a new directory `learning_rate` in `experiments` for you. 
Now, run 79 | 80 | ``` 81 | python search_hyperparams.py --data_dir data/small --parent_dir experiments/learning_rate 82 | ``` 83 | 84 | It will train and evaluate a model with different values of learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`. 85 | 86 | 5. **Display the results** of the hyperparameter search in a nice format 87 | 88 | ``` 89 | python synthesize_results.py --parent_dir experiments/learning_rate 90 | ``` 91 | 92 | 6. **Evaluation on the test set** Once you've run many experiments and selected your best model and hyperparameters based on the performance on the development set, you can finally evaluate the performance of your model on the test set. Run 93 | 94 | ``` 95 | python evaluate.py --data_dir data/small --model_dir experiments/base_model 96 | ``` 97 | 98 | ## Guidelines for more advanced use 99 | 100 | We recommend reading through `train.py` to get a high-level overview of the training loop steps: 101 | 102 | - loading the hyperparameters for the experiment (the `params.json`) 103 | - loading the training and validation data 104 | - creating the model, loss_fn and metrics 105 | - training the model for a given number of epochs by calling `train_and_evaluate(...)` 106 | 107 | You can then go through `model/data_loader.py` to understand the following steps: 108 | 109 | - loading the vocabularies from the `words.txt` and `tags.txt` files 110 | - creating the sentences/labels datasets from the text files 111 | - how the vocabulary is used to map tokens to their indices 112 | - how the `data_iterator` creates a batch of data and labels and pads sentences (see the sketch at the end of this section) 113 | 114 | Once you get the high-level idea, depending on your dataset, you might want to modify 115 | 116 | - `model/net.py` to change the neural network, loss function and metrics 117 | - `model/data_loader.py` to adapt the data loader to your specific needs 118 | - `train.py` for changing the optimizer 119 | - `train.py` and `evaluate.py`, since changes to the model or its inputs usually require corresponding changes there 120 | 121 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs.
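Once you are comfortable with the flow above, it can help to load one batch interactively and look at the padding directly. The snippet below is a minimal sketch (not part of the starter code): it assumes you run it from `pytorch/nlp`, that you have already run `python build_vocab.py --data_dir data/small`, and it reuses the default `data/small` and `experiments/base_model` paths from the Quickstart.

```python
# Minimal sketch: inspect one padded batch produced by model/data_loader.py.
# Assumes build_vocab.py has been run on data/small so that words.txt, tags.txt
# and dataset_params.json exist.
import torch

import utils
from model.data_loader import DataLoader

params = utils.Params('experiments/base_model/params.json')
params.cuda = torch.cuda.is_available()          # data_iterator checks this flag

data_loader = DataLoader('data/small', params)   # loads the vocab and dataset_params.json
data = data_loader.load_data(['train'], 'data/small')
train_iterator = data_loader.data_iterator(data['train'], params, shuffle=False)

batch_data, batch_labels = next(train_iterator)
print(batch_data.shape)    # (batch_size, length of the longest sentence in this batch)
print(batch_labels[0])     # tag indices; -1 marks PADding positions
```

Shorter sentences in the batch are padded with the index of the PAD word and their labels are set to -1, which is exactly what `loss_fn` in `model/net.py` uses to mask them out of the loss.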
122 | 123 | ## Resources 124 | 125 | - [PyTorch documentation](http://pytorch.org/docs/1.2.0/) 126 | - [Tutorials](http://pytorch.org/tutorials/) 127 | - [PyTorch warm-up](https://github.com/jcjohnson/pytorch-examples) 128 | -------------------------------------------------------------------------------- /pytorch/nlp/build_kaggle_dataset.py: -------------------------------------------------------------------------------- 1 | """Read, split and save the kaggle dataset for our model""" 2 | 3 | import csv 4 | import os 5 | import sys 6 | 7 | 8 | def load_dataset(path_csv): 9 | """Loads dataset into memory from csv file""" 10 | # Open the csv file, need to specify the encoding for python3 11 | use_python3 = sys.version_info[0] >= 3 12 | with (open(path_csv, encoding="windows-1252") if use_python3 else open(path_csv)) as f: 13 | csv_file = csv.reader(f, delimiter=',') 14 | dataset = [] 15 | words, tags = [], [] 16 | 17 | # Each line of the csv corresponds to one word 18 | for idx, row in enumerate(csv_file): 19 | if idx == 0: continue 20 | sentence, word, pos, tag = row 21 | # If the first column is non empty it means we reached a new sentence 22 | if len(sentence) != 0: 23 | if len(words) > 0: 24 | assert len(words) == len(tags) 25 | dataset.append((words, tags)) 26 | words, tags = [], [] 27 | try: 28 | word, tag = str(word), str(tag) 29 | words.append(word) 30 | tags.append(tag) 31 | except UnicodeDecodeError as e: 32 | print("An exception was raised, skipping a word: {}".format(e)) 33 | pass 34 | 35 | return dataset 36 | 37 | 38 | def save_dataset(dataset, save_dir): 39 | """Writes sentences.txt and labels.txt files in save_dir from dataset 40 | 41 | Args: 42 | dataset: ([(["a", "cat"], ["O", "O"]), ...]) 43 | save_dir: (string) 44 | """ 45 | # Create directory if it doesn't exist 46 | print("Saving in {}...".format(save_dir)) 47 | if not os.path.exists(save_dir): 48 | os.makedirs(save_dir) 49 | 50 | # Export the dataset 51 | with open(os.path.join(save_dir, 'sentences.txt'), 'w') as file_sentences: 52 | with open(os.path.join(save_dir, 'labels.txt'), 'w') as file_labels: 53 | for words, tags in dataset: 54 | file_sentences.write("{}\n".format(" ".join(words))) 55 | file_labels.write("{}\n".format(" ".join(tags))) 56 | print("- done.") 57 | 58 | 59 | if __name__ == "__main__": 60 | # Check that the dataset exists (you need to make sure you haven't downloaded the `ner.csv`) 61 | path_dataset = 'data/kaggle/ner_dataset.csv' 62 | msg = "{} file not found. 
Make sure you have downloaded the right dataset".format(path_dataset) 63 | assert os.path.isfile(path_dataset), msg 64 | 65 | # Load the dataset into memory 66 | print("Loading Kaggle dataset into memory...") 67 | dataset = load_dataset(path_dataset) 68 | print("- done.") 69 | 70 | # Split the dataset into train, val and test (dummy split with no shuffle) 71 | train_dataset = dataset[:int(0.7*len(dataset))] 72 | val_dataset = dataset[int(0.7*len(dataset)) : int(0.85*len(dataset))] 73 | test_dataset = dataset[int(0.85*len(dataset)):] 74 | 75 | # Save the datasets to files 76 | save_dataset(train_dataset, 'data/kaggle/train') 77 | save_dataset(val_dataset, 'data/kaggle/val') 78 | save_dataset(test_dataset, 'data/kaggle/test') -------------------------------------------------------------------------------- /pytorch/nlp/build_vocab.py: -------------------------------------------------------------------------------- 1 | """Build vocabularies of words and tags from datasets""" 2 | 3 | import argparse 4 | from collections import Counter 5 | import json 6 | import os 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--min_count_word', default=1, help="Minimum count for words in the dataset", type=int) 11 | parser.add_argument('--min_count_tag', default=1, help="Minimum count for tags in the dataset", type=int) 12 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 13 | 14 | # Hyper parameters for the vocab 15 | PAD_WORD = '<pad>' 16 | PAD_TAG = 'O' 17 | UNK_WORD = 'UNK' 18 | 19 | 20 | def save_vocab_to_txt_file(vocab, txt_path): 21 | """Writes one token per line, 0-based line id corresponds to the id of the token. 22 | 23 | Args: 24 | vocab: (iterable object) yields token 25 | txt_path: (string) path to vocab file 26 | """ 27 | with open(txt_path, "w") as f: 28 | for token in vocab: 29 | f.write(token + '\n') 30 | 31 | 32 | def save_dict_to_json(d, json_path): 33 | """Saves dict to json file 34 | 35 | Args: 36 | d: (dict) 37 | json_path: (string) path to json file 38 | """ 39 | with open(json_path, 'w') as f: 40 | d = {k: v for k, v in d.items()} 41 | json.dump(d, f, indent=4) 42 | 43 | 44 | def update_vocab(txt_path, vocab): 45 | """Update word and tag vocabulary from dataset 46 | 47 | Args: 48 | txt_path: (string) path to file, one sentence per line 49 | vocab: (dict or Counter) with update method 50 | 51 | Returns: 52 | dataset_size: (int) number of elements in the dataset 53 | """ 54 | with open(txt_path) as f: 55 | for i, line in enumerate(f): 56 | vocab.update(line.strip().split(' ')) 57 | 58 | return i + 1 59 | 60 | 61 | if __name__ == '__main__': 62 | args = parser.parse_args() 63 | 64 | # Build word vocab with train and test datasets 65 | print("Building word vocabulary...") 66 | words = Counter() 67 | size_train_sentences = update_vocab(os.path.join(args.data_dir, 'train/sentences.txt'), words) 68 | size_dev_sentences = update_vocab(os.path.join(args.data_dir, 'val/sentences.txt'), words) 69 | size_test_sentences = update_vocab(os.path.join(args.data_dir, 'test/sentences.txt'), words) 70 | print("- done.") 71 | 72 | # Build tag vocab with train and test datasets 73 | print("Building tag vocabulary...") 74 | tags = Counter() 75 | size_train_tags = update_vocab(os.path.join(args.data_dir, 'train/labels.txt'), tags) 76 | size_dev_tags = update_vocab(os.path.join(args.data_dir, 'val/labels.txt'), tags) 77 | size_test_tags = update_vocab(os.path.join(args.data_dir, 'test/labels.txt'), tags) 78 | print("- done.") 79 | 80 | # 
Assert same number of examples in datasets 81 | assert size_train_sentences == size_train_tags 82 | assert size_dev_sentences == size_dev_tags 83 | assert size_test_sentences == size_test_tags 84 | 85 | # Only keep most frequent tokens 86 | words = [tok for tok, count in words.items() if count >= args.min_count_word] 87 | tags = [tok for tok, count in tags.items() if count >= args.min_count_tag] 88 | 89 | # Add pad tokens 90 | if PAD_WORD not in words: words.append(PAD_WORD) 91 | if PAD_TAG not in tags: tags.append(PAD_TAG) 92 | 93 | # add word for unknown words 94 | words.append(UNK_WORD) 95 | 96 | # Save vocabularies to file 97 | print("Saving vocabularies to file...") 98 | save_vocab_to_txt_file(words, os.path.join(args.data_dir, 'words.txt')) 99 | save_vocab_to_txt_file(tags, os.path.join(args.data_dir, 'tags.txt')) 100 | print("- done.") 101 | 102 | # Save datasets properties in json file 103 | sizes = { 104 | 'train_size': size_train_sentences, 105 | 'dev_size': size_dev_sentences, 106 | 'test_size': size_test_sentences, 107 | 'vocab_size': len(words), 108 | 'number_of_tags': len(tags), 109 | 'pad_word': PAD_WORD, 110 | 'pad_tag': PAD_TAG, 111 | 'unk_word': UNK_WORD 112 | } 113 | save_dict_to_json(sizes, os.path.join(args.data_dir, 'dataset_params.json')) 114 | 115 | # Logging sizes 116 | to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items()) 117 | print("Characteristics of the dataset:\n{}".format(to_print)) 118 | -------------------------------------------------------------------------------- /pytorch/nlp/data/kaggle/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/nlp/data/kaggle/.gitkeep -------------------------------------------------------------------------------- /pytorch/nlp/data/small/test/labels.txt: -------------------------------------------------------------------------------- 1 | O O B-org I-org I-org I-org O B-geo O B-gpe B-per I-per I-per I-per O O O O O O B-geo O O 2 | O O O O O O O O O 3 | O O O O O B-geo O O O B-tim O O O O O O O 4 | O O O O O O O O O O O O O B-geo O O O O O O O O O O 5 | O B-org I-org I-org I-org I-org O O O O B-org I-org I-org I-org I-org O O O O O O O O O O B-geo O 6 | O O O O O O O O O O B-geo O O B-org I-org I-org I-org O 7 | O O O O B-org O O O O O O O O O B-geo O O O O 8 | O O O O B-geo O O O O O O O O O O O O O O O O O O O B-geo O 9 | B-gpe O O O O O O O O O O O O O O B-gpe O O B-per I-per O 10 | O O O O O O B-tim O O O O O O O O O O B-geo O -------------------------------------------------------------------------------- /pytorch/nlp/data/small/test/sentences.txt: -------------------------------------------------------------------------------- 1 | At the Group of Eight summit in Scotland , Japanese Prime Minister Junichiro Koizumi said he is outraged by the London attacks . 2 | He noted terrorist acts must not be forgivable . 3 | Sarin gas attacks on the Tokyo subway system in 1995 killed 12 people and injured thousands . 4 | A human rights group has called on Asian leaders to increase pressure on Burma to hasten democratic reforms and stop human rights abuses . 5 | The Alternative ASEAN Network for Burma said officials from the Association of Southeast Asian Nations meeting this week should consider new options in dealing with Burma . 6 | It said leaders should consider supporting a possible resolution on Burma by the United Nations Security Council . 
7 | The group also urged ASEAN leaders to acknowledge the many security problems caused by Burma 's military regime . 8 | The rights group accuses Burma 's government of involvement in illegal drug trafficking and human rights abuses , especially against some ethnic groups in Burma . 9 | Iraqi officials say gunmen have killed a member of the secular coalition led by former Iraqi prime minister Ayad Allawi . 10 | Officials say Faras al-Jabouri was shot Saturday after gunmen raided his home near the northern city of Mosul . -------------------------------------------------------------------------------- /pytorch/nlp/data/small/train/labels.txt: -------------------------------------------------------------------------------- 1 | O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O 2 | O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O 3 | O O O O O O O O O O O B-geo I-geo O 4 | O O O O O O O O O O O O O O O 5 | O O O O O O O O O O O B-geo O O B-org I-org O O O B-gpe O O O B-geo O 6 | O O O O O B-gpe O O O O B-geo O O O O O O O B-gpe O O O O O 7 | O B-geo O O O O O O O O O O O O B-geo O B-geo O O B-geo O 8 | O B-org I-org I-org I-org O O O O O O O O B-geo B-tim O O O O O B-gpe O O O O O O O 9 | B-gpe O O O O O O O O O O B-geo O O O 10 | B-gpe O O O O O O O O O O O O O O B-tim O O O B-org O O O O O -------------------------------------------------------------------------------- /pytorch/nlp/data/small/train/sentences.txt: -------------------------------------------------------------------------------- 1 | Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country . 2 | Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . " 3 | They marched from the Houses of Parliament to a rally in Hyde Park . 4 | Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 . 5 | The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton . 6 | The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country . 7 | The London march came ahead of anti-war protests today in other cities , including Rome , Paris , and Madrid . 8 | The International Atomic Energy Agency is to hold second day of talks in Vienna Wednesday on how to respond to Iran 's resumption of low-level uranium conversion . 9 | Iran this week restarted parts of the conversion process at its Isfahan nuclear plant . 10 | Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning . 
-------------------------------------------------------------------------------- /pytorch/nlp/data/small/val/labels.txt: -------------------------------------------------------------------------------- 1 | B-per I-per O O O B-tim O O B-tim O O O B-geo O O O O O O O O O O 2 | O O O O B-gpe B-per I-per I-per O O B-org I-org I-org I-org I-org O O O O O O O O B-geo O O O O O O O O O 3 | O B-org I-org I-org O O O O O O O O 4 | B-gpe O O O O O O O B-gpe O O O O O O O O O O O O O O O O B-geo O 5 | O O O B-org I-org O O O O O B-geo O O O O B-org O O O O B-gpe O O O O O 6 | O O O B-per I-per I-per O O B-tim O B-gpe O O O O O O O B-gpe O O B-geo O O O O O O O O B-geo O 7 | B-org O O O O O O O O O O O O O O O O O O O O O O 8 | O B-tim O B-gpe O O O O O O O O O O O O O O O O O O B-org O O O 9 | O B-gpe O O O O O O O O O O O O O O O O O O B-org O B-org O 10 | B-geo O O O O O O O O O O O O O O O O O O O O O O O -------------------------------------------------------------------------------- /pytorch/nlp/data/small/val/sentences.txt: -------------------------------------------------------------------------------- 1 | Mr. Nour was arrested in January and spent six weeks in a Cairo jail , before his release on bond last week . 2 | In a letter to Egyptian President Hosni Mubarak , the New York-based Human Rights Watch said it was dismayed by what it called Cairo 's " radical intolerance " toward political dissent . 3 | The U.S. State Department and the European parliament also voiced concern . 4 | Pakistani military officials say 14 of about 40 Pakistani soldiers who went missing following an attack on a security checkpoint have been found in neighboring Afghanistan . 5 | Officials say the Frontier Corps paramilitary troops disappeared from the Mohmand tribal region after a Taliban insurgent attack along the Afghan border earlier this week . 6 | Military spokesman Major General Athar Abbas told reporters Thursday that Afghan authorities handed over the troops to the Pakistani consulate in Jalalabad and the soldiers were being flown back to Pakistan . 7 | Taliban militants said they captured 10 soldiers during the attack on the paramilitary post , but officials could not verify the claim . 8 | On Wednesday , Pakistani officials said 10 paramilitary soldiers and at least 36 militants were killed in fighting in the country 's Bajaur tribal region . 9 | The Pakistani military has twice declared victory there following offensives aimed at clearing the area of insurgents linked to the Taliban and al-Qaida . 10 | Thailand 's military has named a committee to begin the process of writing a new constitution , following a military coup last month . 
-------------------------------------------------------------------------------- /pytorch/nlp/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluates the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | import utils 10 | import model.net as net 11 | from model.data_loader import DataLoader 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 15 | parser.add_argument('--model_dir', default='experiments/base_model', help="Directory containing params.json") 16 | parser.add_argument('--restore_file', default='best', help="name of the file in --model_dir \ 17 | containing weights to load") 18 | 19 | 20 | def evaluate(model, loss_fn, data_iterator, metrics, params, num_steps): 21 | """Evaluate the model on `num_steps` batches. 22 | 23 | Args: 24 | model: (torch.nn.Module) the neural network 25 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 26 | data_iterator: (generator) a generator that generates batches of data and labels 27 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 28 | params: (Params) hyperparameters 29 | num_steps: (int) number of batches to train on, each of size params.batch_size 30 | """ 31 | 32 | # set model to evaluation mode 33 | model.eval() 34 | 35 | # summary for current eval loop 36 | summ = [] 37 | 38 | # compute metrics over the dataset 39 | for _ in range(num_steps): 40 | # fetch the next evaluation batch 41 | data_batch, labels_batch = next(data_iterator) 42 | 43 | # compute model output 44 | output_batch = model(data_batch) 45 | loss = loss_fn(output_batch, labels_batch) 46 | 47 | # extract data from torch Variable, move to cpu, convert to numpy arrays 48 | output_batch = output_batch.data.cpu().numpy() 49 | labels_batch = labels_batch.data.cpu().numpy() 50 | 51 | # compute all metrics on this batch 52 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 53 | for metric in metrics} 54 | summary_batch['loss'] = loss.item() 55 | summ.append(summary_batch) 56 | 57 | # compute mean of all metrics in summary 58 | metrics_mean = {metric:np.mean([x[metric] for x in summ]) for metric in summ[0]} 59 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items()) 60 | logging.info("- Eval metrics : " + metrics_string) 61 | return metrics_mean 62 | 63 | 64 | if __name__ == '__main__': 65 | """ 66 | Evaluate the model on the test set. 
67 | """ 68 | # Load the parameters 69 | args = parser.parse_args() 70 | json_path = os.path.join(args.model_dir, 'params.json') 71 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 72 | params = utils.Params(json_path) 73 | 74 | # use GPU if available 75 | params.cuda = torch.cuda.is_available() # use GPU is available 76 | 77 | # Set the random seed for reproducible experiments 78 | torch.manual_seed(230) 79 | if params.cuda: torch.cuda.manual_seed(230) 80 | 81 | # Get the logger 82 | utils.set_logger(os.path.join(args.model_dir, 'evaluate.log')) 83 | 84 | # Create the input data pipeline 85 | logging.info("Creating the dataset...") 86 | 87 | # load data 88 | data_loader = DataLoader(args.data_dir, params) 89 | data = data_loader.load_data(['test'], args.data_dir) 90 | test_data = data['test'] 91 | 92 | # specify the test set size 93 | params.test_size = test_data['size'] 94 | test_data_iterator = data_loader.data_iterator(test_data, params) 95 | 96 | logging.info("- done.") 97 | 98 | # Define the model 99 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 100 | 101 | loss_fn = net.loss_fn 102 | metrics = net.metrics 103 | 104 | logging.info("Starting evaluation") 105 | 106 | # Reload weights from the saved file 107 | utils.load_checkpoint(os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model) 108 | 109 | # Evaluate 110 | num_steps = (params.test_size + 1) // params.batch_size 111 | test_metrics = evaluate(model, loss_fn, test_data_iterator, metrics, params, num_steps) 112 | save_path = os.path.join(args.model_dir, "metrics_test_{}.json".format(args.restore_file)) 113 | utils.save_dict_to_json(test_metrics, save_path) 114 | -------------------------------------------------------------------------------- /pytorch/nlp/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 5, 4 | "num_epochs": 10, 5 | 6 | "lstm_hidden_dim": 50, 7 | "embedding_dim": 50, 8 | 9 | "save_summary_steps": 100 10 | } 11 | -------------------------------------------------------------------------------- /pytorch/nlp/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 5, 4 | "num_epochs": 10, 5 | 6 | "lstm_hidden_dim": 50, 7 | "embedding_dim": 50, 8 | 9 | "save_summary_steps": 100 10 | } 11 | -------------------------------------------------------------------------------- /pytorch/nlp/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/nlp/model/__init__.py -------------------------------------------------------------------------------- /pytorch/nlp/model/data_loader.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import os 4 | import sys 5 | 6 | import torch 7 | from torch.autograd import Variable 8 | 9 | import utils 10 | 11 | 12 | class DataLoader(object): 13 | """ 14 | Handles all aspects of the data. Stores the dataset_params, vocabulary and tags with their mappings to indices. 15 | """ 16 | def __init__(self, data_dir, params): 17 | """ 18 | Loads dataset_params, vocabulary and tags. Ensure you have run `build_vocab.py` on data_dir before using this 19 | class. 
20 | 21 | Args: 22 | data_dir: (string) directory containing the dataset 23 | params: (Params) hyperparameters of the training process. This function modifies params and appends 24 | dataset_params (such as vocab size, num_of_tags etc.) to params. 25 | """ 26 | 27 | # loading dataset_params 28 | json_path = os.path.join(data_dir, 'dataset_params.json') 29 | assert os.path.isfile(json_path), "No json file found at {}, run build_vocab.py".format(json_path) 30 | self.dataset_params = utils.Params(json_path) 31 | 32 | # loading vocab (we require this to map words to their indices) 33 | vocab_path = os.path.join(data_dir, 'words.txt') 34 | self.vocab = {} 35 | with open(vocab_path) as f: 36 | for i, l in enumerate(f.read().splitlines()): 37 | self.vocab[l] = i 38 | 39 | # setting the indices for UNKnown words and PADding symbols 40 | self.unk_ind = self.vocab[self.dataset_params.unk_word] 41 | self.pad_ind = self.vocab[self.dataset_params.pad_word] 42 | 43 | # loading tags (we require this to map tags to their indices) 44 | tags_path = os.path.join(data_dir, 'tags.txt') 45 | self.tag_map = {} 46 | with open(tags_path) as f: 47 | for i, t in enumerate(f.read().splitlines()): 48 | self.tag_map[t] = i 49 | 50 | # adding dataset parameters to param (e.g. vocab size, ) 51 | params.update(json_path) 52 | 53 | def load_sentences_labels(self, sentences_file, labels_file, d): 54 | """ 55 | Loads sentences and labels from their corresponding files. Maps tokens and tags to their indices and stores 56 | them in the provided dict d. 57 | 58 | Args: 59 | sentences_file: (string) file with sentences with tokens space-separated 60 | labels_file: (string) file with NER tags for the sentences in labels_file 61 | d: (dict) a dictionary in which the loaded data is stored 62 | """ 63 | 64 | sentences = [] 65 | labels = [] 66 | 67 | with open(sentences_file) as f: 68 | for sentence in f.read().splitlines(): 69 | # replace each token by its index if it is in vocab 70 | # else use index of UNK_WORD 71 | s = [self.vocab[token] if token in self.vocab 72 | else self.unk_ind 73 | for token in sentence.split(' ')] 74 | sentences.append(s) 75 | 76 | with open(labels_file) as f: 77 | for sentence in f.read().splitlines(): 78 | # replace each label by its index 79 | l = [self.tag_map[label] for label in sentence.split(' ')] 80 | labels.append(l) 81 | 82 | # checks to ensure there is a tag for each token 83 | assert len(labels) == len(sentences) 84 | for i in range(len(labels)): 85 | assert len(labels[i]) == len(sentences[i]) 86 | 87 | # storing sentences and labels in dict d 88 | d['data'] = sentences 89 | d['labels'] = labels 90 | d['size'] = len(sentences) 91 | 92 | def load_data(self, types, data_dir): 93 | """ 94 | Loads the data for each type in types from data_dir. 95 | 96 | Args: 97 | types: (list) has one or more of 'train', 'val', 'test' depending on which data is required 98 | data_dir: (string) directory containing the dataset 99 | 100 | Returns: 101 | data: (dict) contains the data with labels for each type in types 102 | 103 | """ 104 | data = {} 105 | 106 | for split in ['train', 'val', 'test']: 107 | if split in types: 108 | sentences_file = os.path.join(data_dir, split, "sentences.txt") 109 | labels_file = os.path.join(data_dir, split, "labels.txt") 110 | data[split] = {} 111 | self.load_sentences_labels(sentences_file, labels_file, data[split]) 112 | 113 | return data 114 | 115 | def data_iterator(self, data, params, shuffle=False): 116 | """ 117 | Returns a generator that yields batches data with labels. 
Batch size is params.batch_size. Expires after one 118 | pass over the data. 119 | 120 | Args: 121 | data: (dict) contains data which has keys 'data', 'labels' and 'size' 122 | params: (Params) hyperparameters of the training process. 123 | shuffle: (bool) whether the data should be shuffled 124 | 125 | Yields: 126 | batch_data: (Variable) dimension batch_size x seq_len with the sentence data 127 | batch_labels: (Variable) dimension batch_size x seq_len with the corresponding labels 128 | 129 | """ 130 | 131 | # make a list that decides the order in which we go over the data- this avoids explicit shuffling of data 132 | order = list(range(data['size'])) 133 | if shuffle: 134 | random.seed(230) 135 | random.shuffle(order) 136 | 137 | # one pass over data 138 | for i in range((data['size']+1)//params.batch_size): 139 | # fetch sentences and tags 140 | batch_sentences = [data['data'][idx] for idx in order[i*params.batch_size:(i+1)*params.batch_size]] 141 | batch_tags = [data['labels'][idx] for idx in order[i*params.batch_size:(i+1)*params.batch_size]] 142 | 143 | # compute length of longest sentence in batch 144 | batch_max_len = max([len(s) for s in batch_sentences]) 145 | 146 | # prepare a numpy array with the data, initialising the data with pad_ind and all labels with -1 147 | # initialising labels to -1 differentiates tokens with tags from PADding tokens 148 | batch_data = self.pad_ind*np.ones((len(batch_sentences), batch_max_len)) 149 | batch_labels = -1*np.ones((len(batch_sentences), batch_max_len)) 150 | 151 | # copy the data to the numpy array 152 | for j in range(len(batch_sentences)): 153 | cur_len = len(batch_sentences[j]) 154 | batch_data[j][:cur_len] = batch_sentences[j] 155 | batch_labels[j][:cur_len] = batch_tags[j] 156 | 157 | # since all data are indices, we convert them to torch LongTensors 158 | batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels) 159 | 160 | # shift tensors to GPU if available 161 | if params.cuda: 162 | batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda() 163 | 164 | # convert them to Variables to record operations in the computational graph 165 | batch_data, batch_labels = Variable(batch_data), Variable(batch_labels) 166 | 167 | yield batch_data, batch_labels 168 | -------------------------------------------------------------------------------- /pytorch/nlp/model/net.py: -------------------------------------------------------------------------------- 1 | """Defines the neural network, losss function and metrics""" 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class Net(nn.Module): 10 | """ 11 | This is the standard way to define your own network in PyTorch. You typically choose the components 12 | (e.g. LSTMs, linear layers etc.) of your network in the __init__ function. You then apply these layers 13 | on the input step-by-step in the forward function. You can use torch.nn.functional to apply functions 14 | such as F.relu, F.sigmoid, F.softmax. Be careful to ensure your dimensions are correct after each step. 15 | 16 | You are encouraged to have a look at the network in pytorch/vision/model/net.py to get a better sense of how 17 | you can go about defining your own network. 
18 | 19 | The documentation for all the various components available to you is here: http://pytorch.org/docs/master/nn.html 20 | """ 21 | 22 | def __init__(self, params): 23 | """ 24 | We define an recurrent network that predicts the NER tags for each token in the sentence. The components 25 | required are: 26 | 27 | - an embedding layer: this layer maps each index in range(params.vocab_size) to a params.embedding_dim vector 28 | - lstm: applying the LSTM on the sequential input returns an output for each token in the sentence 29 | - fc: a fully connected layer that converts the LSTM output for each token to a distribution over NER tags 30 | 31 | Args: 32 | params: (Params) contains vocab_size, embedding_dim, lstm_hidden_dim 33 | """ 34 | super(Net, self).__init__() 35 | 36 | # the embedding takes as input the vocab_size and the embedding_dim 37 | self.embedding = nn.Embedding(params.vocab_size, params.embedding_dim) 38 | 39 | # the LSTM takes as input the size of its input (embedding_dim), its hidden size 40 | # for more details on how to use it, check out the documentation 41 | self.lstm = nn.LSTM(params.embedding_dim, 42 | params.lstm_hidden_dim, batch_first=True) 43 | 44 | # the fully connected layer transforms the output to give the final output layer 45 | self.fc = nn.Linear(params.lstm_hidden_dim, params.number_of_tags) 46 | 47 | def forward(self, s): 48 | """ 49 | This function defines how we use the components of our network to operate on an input batch. 50 | 51 | Args: 52 | s: (Variable) contains a batch of sentences, of dimension batch_size x seq_len, where seq_len is 53 | the length of the longest sentence in the batch. For sentences shorter than seq_len, the remaining 54 | tokens are PADding tokens. Each row is a sentence with each element corresponding to the index of 55 | the token in the vocab. 56 | 57 | Returns: 58 | out: (Variable) dimension batch_size*seq_len x num_tags with the log probabilities of tokens for each token 59 | of each sentence. 60 | 61 | Note: the dimensions after each step are provided 62 | """ 63 | # -> batch_size x seq_len 64 | # apply the embedding layer that maps each token to its embedding 65 | # dim: batch_size x seq_len x embedding_dim 66 | s = self.embedding(s) 67 | 68 | # run the LSTM along the sentences of length seq_len 69 | # dim: batch_size x seq_len x lstm_hidden_dim 70 | s, _ = self.lstm(s) 71 | 72 | # make the Variable contiguous in memory (a PyTorch artefact) 73 | s = s.contiguous() 74 | 75 | # reshape the Variable so that each row contains one token 76 | # dim: batch_size*seq_len x lstm_hidden_dim 77 | s = s.view(-1, s.shape[2]) 78 | 79 | # apply the fully connected layer and obtain the output (before softmax) for each token 80 | s = self.fc(s) # dim: batch_size*seq_len x num_tags 81 | 82 | # apply log softmax on each token's output (this is recommended over applying softmax 83 | # since it is numerically more stable) 84 | return F.log_softmax(s, dim=1) # dim: batch_size*seq_len x num_tags 85 | 86 | 87 | def loss_fn(outputs, labels): 88 | """ 89 | Compute the cross entropy loss given outputs from the model and labels for all tokens. Exclude loss terms 90 | for PADding tokens. 91 | 92 | Args: 93 | outputs: (Variable) dimension batch_size*seq_len x num_tags - log softmax output of the model 94 | labels: (Variable) dimension batch_size x seq_len where each element is either a label in [0, 1, ... num_tag-1], 95 | or -1 in case it is a PADding token. 
96 | 97 | Returns: 98 | loss: (Variable) cross entropy loss for all tokens in the batch 99 | 100 | Note: you may use a standard loss function from http://pytorch.org/docs/master/nn.html#loss-functions. This example 101 | demonstrates how you can easily define a custom loss function. 102 | """ 103 | 104 | # reshape labels to give a flat vector of length batch_size*seq_len 105 | labels = labels.view(-1) 106 | 107 | # since PADding tokens have label -1, we can generate a mask to exclude the loss from those terms 108 | mask = (labels >= 0).float() 109 | 110 | # indexing with negative values is not supported. Since PADded tokens have label -1, we convert them to a positive 111 | # number. This does not affect training, since we ignore the PADded tokens with the mask. 112 | labels = labels % outputs.shape[1] 113 | 114 | num_tokens = int(torch.sum(mask)) 115 | 116 | # compute cross entropy loss for all tokens (except PADding tokens), by multiplying with mask. 117 | return -torch.sum(outputs[range(outputs.shape[0]), labels]*mask)/num_tokens 118 | 119 | 120 | def accuracy(outputs, labels): 121 | """ 122 | Compute the accuracy, given the outputs and labels for all tokens. Exclude PADding terms. 123 | 124 | Args: 125 | outputs: (np.ndarray) dimension batch_size*seq_len x num_tags - log softmax output of the model 126 | labels: (np.ndarray) dimension batch_size x seq_len where each element is either a label in 127 | [0, 1, ... num_tag-1], or -1 in case it is a PADding token. 128 | 129 | Returns: (float) accuracy in [0,1] 130 | """ 131 | 132 | # reshape labels to give a flat vector of length batch_size*seq_len 133 | labels = labels.ravel() 134 | 135 | # since PADding tokens have label -1, we can generate a mask to exclude the loss from those terms 136 | mask = (labels >= 0) 137 | 138 | # np.argmax gives us the class predicted for each token by the model 139 | outputs = np.argmax(outputs, axis=1) 140 | 141 | # compare outputs with labels and divide by number of tokens (excluding PADding tokens) 142 | return np.sum(outputs == labels)/float(np.sum(mask)) 143 | 144 | 145 | # maintain all metrics required in this dictionary- these are used in the training and evaluation loops 146 | metrics = { 147 | 'accuracy': accuracy, 148 | # could add more metrics such as accuracy for each token type 149 | } 150 | -------------------------------------------------------------------------------- /pytorch/nlp/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | Pillow 3 | torch>=1.2 4 | tabulate 5 | tqdm 6 | -------------------------------------------------------------------------------- /pytorch/nlp/search_hyperparams.py: -------------------------------------------------------------------------------- 1 | """Peform hyperparemeters search""" 2 | 3 | import argparse 4 | import os 5 | from subprocess import check_call 6 | import sys 7 | 8 | import utils 9 | 10 | 11 | PYTHON = sys.executable 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--parent_dir', default='experiments/learning_rate', 14 | help='Directory containing params.json') 15 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 16 | 17 | 18 | def launch_training_job(parent_dir, data_dir, job_name, params): 19 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name 20 | 21 | Args: 22 | model_dir: (string) directory containing config, weights and log 23 | data_dir: (string) directory containing the dataset 24 | 
params: (dict) containing hyperparameters 25 | """ 26 | # Create a new folder in parent_dir with unique_name "job_name" 27 | model_dir = os.path.join(parent_dir, job_name) 28 | if not os.path.exists(model_dir): 29 | os.makedirs(model_dir) 30 | 31 | # Write parameters in json file 32 | json_path = os.path.join(model_dir, 'params.json') 33 | params.save(json_path) 34 | 35 | # Launch training with this config 36 | cmd = "{python} train.py --model_dir={model_dir} --data_dir {data_dir}".format(python=PYTHON, model_dir=model_dir, 37 | data_dir=data_dir) 38 | print(cmd) 39 | check_call(cmd, shell=True) 40 | 41 | 42 | if __name__ == "__main__": 43 | # Load the "reference" parameters from parent_dir json file 44 | args = parser.parse_args() 45 | json_path = os.path.join(args.parent_dir, 'params.json') 46 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 47 | params = utils.Params(json_path) 48 | 49 | # Perform hypersearch over one parameter 50 | learning_rates = [1e-4, 1e-3, 1e-2] 51 | 52 | for learning_rate in learning_rates: 53 | # Modify the relevant parameter in params 54 | params.learning_rate = learning_rate 55 | 56 | # Launch job (name has to be unique) 57 | job_name = "learning_rate_{}".format(learning_rate) 58 | launch_training_job(args.parent_dir, args.data_dir, job_name, params) 59 | -------------------------------------------------------------------------------- /pytorch/nlp/synthesize_results.py: -------------------------------------------------------------------------------- 1 | """Aggregates results from the metrics_eval_best_weights.json in a parent folder""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in folder `parent_dir`. 17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_dev.json` 20 | 21 | Args: 22 | parent_dir: (string) path to directory containing experiments results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_val_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. 
Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) -------------------------------------------------------------------------------- /pytorch/nlp/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | import torch.optim as optim 10 | from tqdm import trange 11 | 12 | import utils 13 | import model.net as net 14 | from model.data_loader import DataLoader 15 | from evaluate import evaluate 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data_dir', default='data/small', 20 | help="Directory containing the dataset") 21 | parser.add_argument('--model_dir', default='experiments/base_model', 22 | help="Directory containing params.json") 23 | parser.add_argument('--restore_file', default=None, 24 | help="Optional, name of the file in --model_dir containing weights to reload before \ 25 | training") # 'best' or 'train' 26 | 27 | 28 | def train(model, optimizer, loss_fn, data_iterator, metrics, params, num_steps): 29 | """Train the model on `num_steps` batches 30 | 31 | Args: 32 | model: (torch.nn.Module) the neural network 33 | optimizer: (torch.optim) optimizer for parameters of model 34 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 35 | data_iterator: (generator) a generator that generates batches of data and labels 36 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 37 | params: (Params) hyperparameters 38 | num_steps: (int) number of batches to train on, each of size params.batch_size 39 | """ 40 | 41 | # set model to training mode 42 | model.train() 43 | 44 | # summary for current training loop and a running average object for loss 45 | summ = [] 46 | loss_avg = utils.RunningAverage() 47 | 48 | # Use tqdm for progress bar 49 | t = trange(num_steps) 50 | for i in t: 51 | # fetch the next training batch 52 | train_batch, labels_batch = next(data_iterator) 53 | 54 | # compute model output and loss 55 | output_batch = model(train_batch) 56 | loss = loss_fn(output_batch, labels_batch) 57 | 58 | # clear previous gradients, compute gradients of all variables wrt loss 59 | optimizer.zero_grad() 60 | loss.backward() 61 | 62 | # performs updates using calculated gradients 63 | optimizer.step() 64 | 65 | # Evaluate summaries only once in a while 66 | if i % params.save_summary_steps == 0: 67 | # extract data from torch Variable, move to cpu, convert to numpy arrays 68 | output_batch = output_batch.data.cpu().numpy() 69 | labels_batch = labels_batch.data.cpu().numpy() 70 | 71 | # compute all metrics on this batch 72 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 73 | for metric in metrics} 74 | 
summary_batch['loss'] = loss.item() 75 | summ.append(summary_batch) 76 | 77 | # update the average loss 78 | loss_avg.update(loss.item()) 79 | t.set_postfix(loss='{:05.3f}'.format(loss_avg())) 80 | 81 | # compute mean of all metrics in summary 82 | metrics_mean = {metric: np.mean([x[metric] 83 | for x in summ]) for metric in summ[0]} 84 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) 85 | for k, v in metrics_mean.items()) 86 | logging.info("- Train metrics: " + metrics_string) 87 | 88 | 89 | def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, restore_file=None): 90 | """Train the model and evaluate every epoch. 91 | 92 | Args: 93 | model: (torch.nn.Module) the neural network 94 | train_data: (dict) training data with keys 'data' and 'labels' 95 | val_data: (dict) validaion data with keys 'data' and 'labels' 96 | optimizer: (torch.optim) optimizer for parameters of model 97 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 98 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 99 | params: (Params) hyperparameters 100 | model_dir: (string) directory containing config, weights and log 101 | restore_file: (string) optional- name of file to restore from (without its extension .pth.tar) 102 | """ 103 | # reload weights from restore_file if specified 104 | if restore_file is not None: 105 | restore_path = os.path.join( 106 | args.model_dir, args.restore_file + '.pth.tar') 107 | logging.info("Restoring parameters from {}".format(restore_path)) 108 | utils.load_checkpoint(restore_path, model, optimizer) 109 | 110 | best_val_acc = 0.0 111 | 112 | for epoch in range(params.num_epochs): 113 | # Run one epoch 114 | logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs)) 115 | 116 | # compute number of batches in one epoch (one full pass over the training set) 117 | num_steps = (params.train_size + 1) // params.batch_size 118 | train_data_iterator = data_loader.data_iterator( 119 | train_data, params, shuffle=True) 120 | train(model, optimizer, loss_fn, train_data_iterator, 121 | metrics, params, num_steps) 122 | 123 | # Evaluate for one epoch on validation set 124 | num_steps = (params.val_size + 1) // params.batch_size 125 | val_data_iterator = data_loader.data_iterator( 126 | val_data, params, shuffle=False) 127 | val_metrics = evaluate( 128 | model, loss_fn, val_data_iterator, metrics, params, num_steps) 129 | 130 | val_acc = val_metrics['accuracy'] 131 | is_best = val_acc >= best_val_acc 132 | 133 | # Save weights 134 | utils.save_checkpoint({'epoch': epoch + 1, 135 | 'state_dict': model.state_dict(), 136 | 'optim_dict': optimizer.state_dict()}, 137 | is_best=is_best, 138 | checkpoint=model_dir) 139 | 140 | # If best_eval, best_save_path 141 | if is_best: 142 | logging.info("- Found new best accuracy") 143 | best_val_acc = val_acc 144 | 145 | # Save best val metrics in a json file in the model directory 146 | best_json_path = os.path.join( 147 | model_dir, "metrics_val_best_weights.json") 148 | utils.save_dict_to_json(val_metrics, best_json_path) 149 | 150 | # Save latest val metrics in a json file in the model directory 151 | last_json_path = os.path.join( 152 | model_dir, "metrics_val_last_weights.json") 153 | utils.save_dict_to_json(val_metrics, last_json_path) 154 | 155 | 156 | if __name__ == '__main__': 157 | 158 | # Load the parameters from json file 159 | args = parser.parse_args() 160 | json_path = 
os.path.join(args.model_dir, 'params.json') 161 | assert os.path.isfile( 162 | json_path), "No json configuration file found at {}".format(json_path) 163 | params = utils.Params(json_path) 164 | 165 | # use GPU if available 166 | params.cuda = torch.cuda.is_available() 167 | 168 | # Set the random seed for reproducible experiments 169 | torch.manual_seed(230) 170 | if params.cuda: 171 | torch.cuda.manual_seed(230) 172 | 173 | # Set the logger 174 | utils.set_logger(os.path.join(args.model_dir, 'train.log')) 175 | 176 | # Create the input data pipeline 177 | logging.info("Loading the datasets...") 178 | 179 | # load data 180 | data_loader = DataLoader(args.data_dir, params) 181 | data = data_loader.load_data(['train', 'val'], args.data_dir) 182 | train_data = data['train'] 183 | val_data = data['val'] 184 | 185 | # specify the train and val dataset sizes 186 | params.train_size = train_data['size'] 187 | params.val_size = val_data['size'] 188 | 189 | logging.info("- done.") 190 | 191 | # Define the model and optimizer 192 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 193 | optimizer = optim.Adam(model.parameters(), lr=params.learning_rate) 194 | 195 | # fetch loss function and metrics 196 | loss_fn = net.loss_fn 197 | metrics = net.metrics 198 | 199 | # Train the model 200 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 201 | train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, args.model_dir, 202 | args.restore_file) 203 | -------------------------------------------------------------------------------- /pytorch/nlp/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import shutil 5 | 6 | import torch 7 | 8 | 9 | class Params(): 10 | """Class that loads hyperparameters from a json file. 11 | 12 | Example: 13 | ``` 14 | params = Params(json_path) 15 | print(params.learning_rate) 16 | params.learning_rate = 0.5 # change the value of learning_rate in params 17 | ``` 18 | """ 19 | 20 | def __init__(self, json_path): 21 | with open(json_path) as f: 22 | params = json.load(f) 23 | self.__dict__.update(params) 24 | 25 | def save(self, json_path): 26 | with open(json_path, 'w') as f: 27 | json.dump(self.__dict__, f, indent=4) 28 | 29 | def update(self, json_path): 30 | """Loads parameters from json file""" 31 | with open(json_path) as f: 32 | params = json.load(f) 33 | self.__dict__.update(params) 34 | 35 | @property 36 | def dict(self): 37 | """Gives dict-like access to Params instance by `params.dict['learning_rate']""" 38 | return self.__dict__ 39 | 40 | 41 | class RunningAverage(): 42 | """A simple class that maintains the running average of a quantity 43 | 44 | Example: 45 | ``` 46 | loss_avg = RunningAverage() 47 | loss_avg.update(2) 48 | loss_avg.update(4) 49 | loss_avg() = 3 50 | ``` 51 | """ 52 | 53 | def __init__(self): 54 | self.steps = 0 55 | self.total = 0 56 | 57 | def update(self, val): 58 | self.total += val 59 | self.steps += 1 60 | 61 | def __call__(self): 62 | return self.total / float(self.steps) 63 | 64 | 65 | def set_logger(log_path): 66 | """Set the logger to log info in terminal and file `log_path`. 67 | 68 | In general, it is useful to have a logger so that every output to the terminal is saved 69 | in a permanent file. Here we save it to `model_dir/train.log`. 
70 | 71 | Example: 72 | ``` 73 | logging.info("Starting training...") 74 | ``` 75 | 76 | Args: 77 | log_path: (string) where to log 78 | """ 79 | logger = logging.getLogger() 80 | logger.setLevel(logging.INFO) 81 | 82 | if not logger.handlers: 83 | # Logging to a file 84 | file_handler = logging.FileHandler(log_path) 85 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 86 | logger.addHandler(file_handler) 87 | 88 | # Logging to console 89 | stream_handler = logging.StreamHandler() 90 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 91 | logger.addHandler(stream_handler) 92 | 93 | 94 | def save_dict_to_json(d, json_path): 95 | """Saves dict of floats in json file 96 | 97 | Args: 98 | d: (dict) of float-castable values (np.float, int, float, etc.) 99 | json_path: (string) path to json file 100 | """ 101 | with open(json_path, 'w') as f: 102 | # We need to convert the values to float for json (it doesn't accept np.array, np.float, ) 103 | d = {k: float(v) for k, v in d.items()} 104 | json.dump(d, f, indent=4) 105 | 106 | 107 | def save_checkpoint(state, is_best, checkpoint): 108 | """Saves model and training parameters at checkpoint + 'last.pth.tar'. If is_best==True, also saves 109 | checkpoint + 'best.pth.tar' 110 | 111 | Args: 112 | state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict 113 | is_best: (bool) True if it is the best model seen till now 114 | checkpoint: (string) folder where parameters are to be saved 115 | """ 116 | filepath = os.path.join(checkpoint, 'last.pth.tar') 117 | if not os.path.exists(checkpoint): 118 | print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint)) 119 | os.mkdir(checkpoint) 120 | else: 121 | print("Checkpoint Directory exists! ") 122 | torch.save(state, filepath) 123 | if is_best: 124 | shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar')) 125 | 126 | 127 | def load_checkpoint(checkpoint, model, optimizer=None): 128 | """Loads model parameters (state_dict) from file_path. If optimizer is provided, loads state_dict of 129 | optimizer assuming it is present in checkpoint. 130 | 131 | Args: 132 | checkpoint: (string) filename which needs to be loaded 133 | model: (torch.nn.Module) model for which the parameters are loaded 134 | optimizer: (torch.optim) optional: resume optimizer from checkpoint 135 | """ 136 | if not os.path.exists(checkpoint): 137 | raise ("File doesn't exist {}".format(checkpoint)) 138 | checkpoint = torch.load(checkpoint) 139 | model.load_state_dict(checkpoint['state_dict']) 140 | 141 | if optimizer: 142 | optimizer.load_state_dict(checkpoint['optim_dict']) 143 | 144 | return checkpoint -------------------------------------------------------------------------------- /pytorch/vision/README.md: -------------------------------------------------------------------------------- 1 | # Hand Signs Recognition with PyTorch 2 | 3 | *Authors: Surag Nair, Olivier Moindrot and Guillaume Genthial* 4 | 5 | Take the time to read the [tutorials](https://cs230-stanford.github.io/project-starter-code.html). 6 | 7 | Note: all scripts must be run in folder `pytorch/vision`. 8 | 9 | ## Requirements 10 | 11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html). 
12 |
13 | ```
14 | virtualenv -p python3 .env
15 | source .env/bin/activate
16 | pip install -r requirements.txt
17 | ```
18 |
19 | When you're done working on the project, deactivate the virtual environment with `deactivate`.
20 |
21 | ## Task
22 |
23 | Given an image of a hand doing a sign representing 0, 1, 2, 3, 4 or 5, predict the correct label.
24 |
25 |
26 | ## Download the SIGNS dataset
27 |
28 | For the vision example, we will use the SIGNS dataset created for this class. The dataset is hosted on Google Drive, download it [here][SIGNS].
29 |
30 | This will download the SIGNS dataset (~1.1 GB) containing photos of hand signs representing the numbers 0 to 5.
31 | Here is the structure of the data:
32 | ```
33 | SIGNS/
34 |     train_signs/
35 |         0_IMG_5864.jpg
36 |         ...
37 |     test_signs/
38 |         0_IMG_5942.jpg
39 |         ...
40 | ```
41 |
42 | The images are named following `{label}_IMG_{id}.jpg` where the label is in `[0, 5]`.
43 | The training set contains 1,080 images and the test set contains 120 images.
44 |
45 | Once the download is complete, move the dataset into `data/SIGNS`.
46 | Run the script `build_dataset.py` which will resize the images to size `(64, 64)`. The new resized dataset will be located by default in `data/64x64_SIGNS`:
47 |
48 | ```bash
49 | python build_dataset.py --data_dir data/SIGNS --output_dir data/64x64_SIGNS
50 | ```
51 |
52 |
53 |
54 | ## Quickstart (~10 min)
55 |
56 | 1. __Build the dataset of size 64x64__: make sure you complete this step before training
57 | ```bash
58 | python build_dataset.py --data_dir data/SIGNS --output_dir data/64x64_SIGNS
59 | ```
60 |
61 | 2. __Your first experiment__ We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the hyperparameters for the experiment. It looks like
62 | ```json
63 | {
64 |     "learning_rate": 1e-3,
65 |     "batch_size": 32,
66 |     "num_epochs": 10,
67 |     ...
68 | }
69 | ```
70 | For every new experiment, you will need to create a new directory under `experiments` with a similar `params.json` file.
71 |
72 | 3. __Train__ your experiment. Simply run
73 | ```
74 | python train.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model
75 | ```
76 | It will instantiate a model and train it on the training set following the hyperparameters specified in `params.json`. It will also evaluate some metrics on the validation set.
77 |
78 | 4. __Your first hyperparameters search__ We created a new directory `learning_rate` in `experiments` for you. Now, run
79 | ```
80 | python search_hyperparams.py --data_dir data/64x64_SIGNS --parent_dir experiments/learning_rate
81 | ```
82 | It will train and evaluate a model with different values of learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`.
83 |
84 | 5. __Display the results__ of the hyperparameters search in a nice format
85 | ```
86 | python synthesize_results.py --parent_dir experiments/learning_rate
87 | ```
88 |
89 | 6. __Evaluation on the test set__ Once you've run many experiments and selected your best model and hyperparameters based on the performance on the validation set, you can finally evaluate the performance of your model on the test set.
Run
90 | ```
91 | python evaluate.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model
92 | ```
93 |
94 |
95 | ## Guidelines for more advanced use
96 |
97 | We recommend reading through `train.py` to get a high-level overview of the training loop steps:
98 | - loading the hyperparameters for the experiment (the `params.json`)
99 | - loading the training and validation data
100 | - creating the model, loss_fn and metrics
101 | - training the model for a given number of epochs by calling `train_and_evaluate(...)`
102 |
103 | You can then have a look at `data_loader.py` to understand:
104 | - how jpg images are loaded and transformed to torch Tensors
105 | - how `fetch_dataloader(...)` uses `SIGNSDataset` to create batches of images and labels
106 |
107 | Once you get the high-level idea, depending on your dataset, you might want to modify
108 | - `model/net.py` to change the neural network, loss function and metrics
109 | - `model/data_loader.py` to suit the data loader to your specific needs
110 | - `train.py` for changing the optimizer
111 | - `train.py` and `evaluate.py`: some changes in the model or input may require changes here as well
112 |
113 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs.
114 |
115 | ## Resources
116 |
117 | - [PyTorch documentation](http://pytorch.org/docs/0.3.0/)
118 | - [Tutorials](http://pytorch.org/tutorials/)
119 | - [PyTorch warm-up](https://github.com/jcjohnson/pytorch-examples)
120 |
121 | [SIGNS]: https://drive.google.com/file/d/1ufiR6hUKhXoAyiBNsySPkUwlvE_wfEHC/view?usp=sharing
122 |
-------------------------------------------------------------------------------- /pytorch/vision/build_dataset.py: --------------------------------------------------------------------------------
1 | """Split the SIGNS dataset into train/val/test and resize images to 64x64.
2 |
3 | The SIGNS dataset comes in the following format:
4 |     train_signs/
5 |         0_IMG_5864.jpg
6 |         ...
7 |     test_signs/
8 |         0_IMG_5942.jpg
9 |         ...
10 |
11 | Original images have size (3024, 3024).
12 | Resizing to (64, 64) reduces the dataset size from 1.16 GB to 4.7 MB, and loading smaller images
13 | makes training faster.
14 |
15 | We already have a test set created, so we only need to split "train_signs" into train and val sets.
16 | Because we don't have a lot of images and we want the statistics on the val set to be as
17 | representative as possible, we'll take 20% of "train_signs" as val set.
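The resulting dataset (by default under data/64x64_SIGNS) mirrors this layout, with train_signs/, val_signs/ and test_signs/ folders of images resized to (64, 64).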
18 | """ 19 | 20 | import argparse 21 | import random 22 | import os 23 | 24 | from PIL import Image 25 | from tqdm import tqdm 26 | 27 | SIZE = 64 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('--data_dir', default='data/SIGNS', help="Directory with the SIGNS dataset") 31 | parser.add_argument('--output_dir', default='data/64x64_SIGNS', help="Where to write the new data") 32 | 33 | 34 | def resize_and_save(filename, output_dir, size=SIZE): 35 | """Resize the image contained in `filename` and save it to the `output_dir`""" 36 | image = Image.open(filename) 37 | # Use bilinear interpolation instead of the default "nearest neighbor" method 38 | image = image.resize((size, size), Image.BILINEAR) 39 | image.save(os.path.join(output_dir, filename.split('/')[-1])) 40 | 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | 45 | assert os.path.isdir(args.data_dir), "Couldn't find the dataset at {}".format(args.data_dir) 46 | 47 | # Define the data directories 48 | train_data_dir = os.path.join(args.data_dir, 'train_signs') 49 | test_data_dir = os.path.join(args.data_dir, 'test_signs') 50 | 51 | # Get the filenames in each directory (train and test) 52 | filenames = os.listdir(train_data_dir) 53 | filenames = [os.path.join(train_data_dir, f) for f in filenames if f.endswith('.jpg')] 54 | 55 | test_filenames = os.listdir(test_data_dir) 56 | test_filenames = [os.path.join(test_data_dir, f) for f in test_filenames if f.endswith('.jpg')] 57 | 58 | # Split the images in 'train_signs' into 80% train and 20% val 59 | # Make sure to always shuffle with a fixed seed so that the split is reproducible 60 | random.seed(230) 61 | filenames.sort() 62 | random.shuffle(filenames) 63 | 64 | split = int(0.8 * len(filenames)) 65 | train_filenames = filenames[:split] 66 | val_filenames = filenames[split:] 67 | 68 | filenames = {'train': train_filenames, 69 | 'val': val_filenames, 70 | 'test': test_filenames} 71 | 72 | if not os.path.exists(args.output_dir): 73 | os.mkdir(args.output_dir) 74 | else: 75 | print("Warning: output dir {} already exists".format(args.output_dir)) 76 | 77 | # Preprocess train, val and test 78 | for split in ['train', 'val', 'test']: 79 | output_dir_split = os.path.join(args.output_dir, '{}_signs'.format(split)) 80 | if not os.path.exists(output_dir_split): 81 | os.mkdir(output_dir_split) 82 | else: 83 | print("Warning: dir {} already exists".format(output_dir_split)) 84 | 85 | print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split)) 86 | for filename in tqdm(filenames[split]): 87 | resize_and_save(filename, output_dir_split, size=SIZE) 88 | 89 | print("Done building dataset") 90 | -------------------------------------------------------------------------------- /pytorch/vision/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/vision/data/.gitkeep -------------------------------------------------------------------------------- /pytorch/vision/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluates the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | from torch.autograd import Variable 10 | import utils 11 | import model.net as net 12 | import model.data_loader as data_loader 13 | 14 | parser = argparse.ArgumentParser() 15 | 
parser.add_argument('--data_dir', default='data/64x64_SIGNS', 16 | help="Directory containing the dataset") 17 | parser.add_argument('--model_dir', default='experiments/base_model', 18 | help="Directory containing params.json") 19 | parser.add_argument('--restore_file', default='best', help="name of the file in --model_dir \ 20 | containing weights to load") 21 | 22 | 23 | def evaluate(model, loss_fn, dataloader, metrics, params): 24 | """Evaluate the model on `num_steps` batches. 25 | 26 | Args: 27 | model: (torch.nn.Module) the neural network 28 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 29 | dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches data 30 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 31 | params: (Params) hyperparameters 32 | num_steps: (int) number of batches to train on, each of size params.batch_size 33 | """ 34 | 35 | # set model to evaluation mode 36 | model.eval() 37 | 38 | # summary for current eval loop 39 | summ = [] 40 | 41 | # compute metrics over the dataset 42 | for data_batch, labels_batch in dataloader: 43 | 44 | # move to GPU if available 45 | if params.cuda: 46 | data_batch, labels_batch = data_batch.cuda( 47 | non_blocking=True), labels_batch.cuda(non_blocking=True) 48 | # fetch the next evaluation batch 49 | data_batch, labels_batch = Variable(data_batch), Variable(labels_batch) 50 | 51 | # compute model output 52 | output_batch = model(data_batch) 53 | loss = loss_fn(output_batch, labels_batch) 54 | 55 | # extract data from torch Variable, move to cpu, convert to numpy arrays 56 | output_batch = output_batch.data.cpu().numpy() 57 | labels_batch = labels_batch.data.cpu().numpy() 58 | 59 | # compute all metrics on this batch 60 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 61 | for metric in metrics} 62 | summary_batch['loss'] = loss.item() 63 | summ.append(summary_batch) 64 | 65 | # compute mean of all metrics in summary 66 | metrics_mean = {metric: np.mean([x[metric] 67 | for x in summ]) for metric in summ[0]} 68 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) 69 | for k, v in metrics_mean.items()) 70 | logging.info("- Eval metrics : " + metrics_string) 71 | return metrics_mean 72 | 73 | 74 | if __name__ == '__main__': 75 | """ 76 | Evaluate the model on the test set. 
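Example: `python evaluate.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model --restore_file best` (these values are also the defaults of the arguments defined above).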
77 | """ 78 | # Load the parameters 79 | args = parser.parse_args() 80 | json_path = os.path.join(args.model_dir, 'params.json') 81 | assert os.path.isfile( 82 | json_path), "No json configuration file found at {}".format(json_path) 83 | params = utils.Params(json_path) 84 | 85 | # use GPU if available 86 | params.cuda = torch.cuda.is_available() # use GPU is available 87 | 88 | # Set the random seed for reproducible experiments 89 | torch.manual_seed(230) 90 | if params.cuda: 91 | torch.cuda.manual_seed(230) 92 | 93 | # Get the logger 94 | utils.set_logger(os.path.join(args.model_dir, 'evaluate.log')) 95 | 96 | # Create the input data pipeline 97 | logging.info("Creating the dataset...") 98 | 99 | # fetch dataloaders 100 | dataloaders = data_loader.fetch_dataloader(['test'], args.data_dir, params) 101 | test_dl = dataloaders['test'] 102 | 103 | logging.info("- done.") 104 | 105 | # Define the model 106 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 107 | 108 | loss_fn = net.loss_fn 109 | metrics = net.metrics 110 | 111 | logging.info("Starting evaluation") 112 | 113 | # Reload weights from the saved file 114 | utils.load_checkpoint(os.path.join( 115 | args.model_dir, args.restore_file + '.pth.tar'), model) 116 | 117 | # Evaluate 118 | test_metrics = evaluate(model, loss_fn, test_dl, metrics, params) 119 | save_path = os.path.join( 120 | args.model_dir, "metrics_test_{}.json".format(args.restore_file)) 121 | utils.save_dict_to_json(test_metrics, save_path) 122 | -------------------------------------------------------------------------------- /pytorch/vision/experiments/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/vision/experiments/.gitkeep -------------------------------------------------------------------------------- /pytorch/vision/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | "dropout_rate":0.8, 6 | "num_channels": 32, 7 | "save_summary_steps": 100, 8 | "num_workers": 4 9 | } 10 | -------------------------------------------------------------------------------- /pytorch/vision/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | "dropout_rate":0.8, 6 | "num_channels": 32, 7 | "save_summary_steps": 100, 8 | "num_workers": 4 9 | } 10 | -------------------------------------------------------------------------------- /pytorch/vision/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/vision/model/__init__.py -------------------------------------------------------------------------------- /pytorch/vision/model/data_loader.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | 4 | from PIL import Image 5 | from torch.utils.data import Dataset, DataLoader 6 | import torchvision.transforms as transforms 7 | 8 | # borrowed from http://pytorch.org/tutorials/advanced/neural_style_tutorial.html 9 | # and http://pytorch.org/tutorials/beginner/data_loading_tutorial.html 10 | # define a training 
image loader that specifies transforms on images. See documentation for more details. 11 | train_transformer = transforms.Compose([ 12 | transforms.Resize(64), # resize the image to 64x64 (remove if images are already 64x64) 13 | transforms.RandomHorizontalFlip(), # randomly flip image horizontally 14 | transforms.ToTensor()]) # transform it into a torch tensor 15 | 16 | # loader for evaluation, no horizontal flip 17 | eval_transformer = transforms.Compose([ 18 | transforms.Resize(64), # resize the image to 64x64 (remove if images are already 64x64) 19 | transforms.ToTensor()]) # transform it into a torch tensor 20 | 21 | 22 | class SIGNSDataset(Dataset): 23 | """ 24 | A standard PyTorch definition of Dataset which defines the functions __len__ and __getitem__. 25 | """ 26 | def __init__(self, data_dir, transform): 27 | """ 28 | Store the filenames of the jpgs to use. Specifies transforms to apply on images. 29 | 30 | Args: 31 | data_dir: (string) directory containing the dataset 32 | transform: (torchvision.transforms) transformation to apply on image 33 | """ 34 | self.filenames = os.listdir(data_dir) 35 | self.filenames = [os.path.join(data_dir, f) for f in self.filenames if f.endswith('.jpg')] 36 | 37 | self.labels = [int(os.path.split(filename)[-1][0]) for filename in self.filenames] 38 | self.transform = transform 39 | 40 | def __len__(self): 41 | # return size of dataset 42 | return len(self.filenames) 43 | 44 | def __getitem__(self, idx): 45 | """ 46 | Fetch index idx image and labels from dataset. Perform transforms on image. 47 | 48 | Args: 49 | idx: (int) index in [0, 1, ..., size_of_dataset-1] 50 | 51 | Returns: 52 | image: (Tensor) transformed image 53 | label: (int) corresponding label of image 54 | """ 55 | image = Image.open(self.filenames[idx]) # PIL image 56 | image = self.transform(image) 57 | return image, self.labels[idx] 58 | 59 | 60 | def fetch_dataloader(types, data_dir, params): 61 | """ 62 | Fetches the DataLoader object for each type in types from data_dir. 63 | 64 | Args: 65 | types: (list) has one or more of 'train', 'val', 'test' depending on which data is required 66 | data_dir: (string) directory containing the dataset 67 | params: (Params) hyperparameters 68 | 69 | Returns: 70 | data: (dict) contains the DataLoader object for each type in types 71 | """ 72 | dataloaders = {} 73 | 74 | for split in ['train', 'val', 'test']: 75 | if split in types: 76 | path = os.path.join(data_dir, "{}_signs".format(split)) 77 | 78 | # use the train_transformer if training data, else use eval_transformer without random flip 79 | if split == 'train': 80 | dl = DataLoader(SIGNSDataset(path, train_transformer), batch_size=params.batch_size, shuffle=True, 81 | num_workers=params.num_workers, 82 | pin_memory=params.cuda) 83 | else: 84 | dl = DataLoader(SIGNSDataset(path, eval_transformer), batch_size=params.batch_size, shuffle=False, 85 | num_workers=params.num_workers, 86 | pin_memory=params.cuda) 87 | 88 | dataloaders[split] = dl 89 | 90 | return dataloaders 91 | -------------------------------------------------------------------------------- /pytorch/vision/model/net.py: -------------------------------------------------------------------------------- 1 | """Defines the neural network, losss function and metrics""" 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class Net(nn.Module): 10 | """ 11 | This is the standard way to define your own network in PyTorch. You typically choose the components 12 | (e.g. 
LSTMs, linear layers etc.) of your network in the __init__ function. You then apply these layers
13 | on the input step-by-step in the forward function. You can use torch.nn.functional to apply functions
14 |
15 | such as F.relu, F.sigmoid, F.softmax, F.max_pool2d. Be careful to ensure your dimensions are correct after each
16 | step. You are encouraged to have a look at the network in pytorch/nlp/model/net.py to get a better sense of how
17 | you can go about defining your own network.
18 |
19 | The documentation for all the various components available to you is here: http://pytorch.org/docs/master/nn.html
20 | """
21 |
22 | def __init__(self, params):
23 | """
24 | We define a convolutional network that predicts the sign from an image. The components
25 | required are:
26 |
27 | - conv1, conv2, conv3: convolutional layers that extract feature maps from the image
28 | - bn1, bn2, bn3: batch normalisation layers applied after each convolution to stabilise training
29 | - fc1, fc2: fully connected layers that map the flattened features to a distribution over the 6 sign labels
30 |
31 | Args:
32 | params: (Params) contains num_channels
33 | """
34 | super(Net, self).__init__()
35 | self.num_channels = params.num_channels
36 |
37 | # each of the convolution layers below has the arguments (input_channels, output_channels, filter_size,
38 | # stride, padding). We also include batch normalisation layers that help stabilise training.
39 | # For more details on how to use these layers, check out the documentation.
40 | self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
41 | self.bn1 = nn.BatchNorm2d(self.num_channels)
42 | self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
43 | self.bn2 = nn.BatchNorm2d(self.num_channels*2)
44 | self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)
45 | self.bn3 = nn.BatchNorm2d(self.num_channels*4)
46 |
47 | # 2 fully connected layers to transform the output of the convolution layers to the final output
48 | self.fc1 = nn.Linear(8*8*self.num_channels*4, self.num_channels*4)
49 | self.fcbn1 = nn.BatchNorm1d(self.num_channels*4)
50 | self.fc2 = nn.Linear(self.num_channels*4, 6)
51 | self.dropout_rate = params.dropout_rate
52 |
53 | def forward(self, s):
54 | """
55 | This function defines how we use the components of our network to operate on an input batch.
56 |
57 | Args:
58 | s: (Variable) contains a batch of images, of dimension batch_size x 3 x 64 x 64.
59 |
60 | Returns:
61 | out: (Variable) dimension batch_size x 6 with the log probabilities for the labels of each image.
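For reference: with 64 x 64 inputs, the three stride-2 max-pools below halve the spatial size three times (64 -> 32 -> 16 -> 8), which is why self.fc1 above expects 8*8*self.num_channels*4 features once the output of conv3 is flattened.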
62 | 63 | Note: the dimensions after each step are provided 64 | """ 65 | # -> batch_size x 3 x 64 x 64 66 | # we apply the convolution layers, followed by batch normalisation, maxpool and relu x 3 67 | s = self.bn1(self.conv1(s)) # batch_size x num_channels x 64 x 64 68 | s = F.relu(F.max_pool2d(s, 2)) # batch_size x num_channels x 32 x 32 69 | s = self.bn2(self.conv2(s)) # batch_size x num_channels*2 x 32 x 32 70 | s = F.relu(F.max_pool2d(s, 2)) # batch_size x num_channels*2 x 16 x 16 71 | s = self.bn3(self.conv3(s)) # batch_size x num_channels*4 x 16 x 16 72 | s = F.relu(F.max_pool2d(s, 2)) # batch_size x num_channels*4 x 8 x 8 73 | 74 | # flatten the output for each image 75 | s = s.view(-1, 8*8*self.num_channels*4) # batch_size x 8*8*num_channels*4 76 | 77 | # apply 2 fully connected layers with dropout 78 | s = F.dropout(F.relu(self.fcbn1(self.fc1(s))), 79 | p=self.dropout_rate, training=self.training) # batch_size x self.num_channels*4 80 | s = self.fc2(s) # batch_size x 6 81 | 82 | # apply log softmax on each image's output (this is recommended over applying softmax 83 | # since it is numerically more stable) 84 | return F.log_softmax(s, dim=1) 85 | 86 | 87 | def loss_fn(outputs, labels): 88 | """ 89 | Compute the cross entropy loss given outputs and labels. 90 | 91 | Args: 92 | outputs: (Variable) dimension batch_size x 6 - output of the model 93 | labels: (Variable) dimension batch_size, where each element is a value in [0, 1, 2, 3, 4, 5] 94 | 95 | Returns: 96 | loss (Variable): cross entropy loss for all images in the batch 97 | 98 | Note: you may use a standard loss function from http://pytorch.org/docs/master/nn.html#loss-functions. This example 99 | demonstrates how you can easily define a custom loss function. 100 | """ 101 | num_examples = outputs.size()[0] 102 | return -torch.sum(outputs[range(num_examples), labels])/num_examples 103 | 104 | 105 | def accuracy(outputs, labels): 106 | """ 107 | Compute the accuracy, given the outputs and labels for all images. 
108 |
109 | Args:
110 | outputs: (np.ndarray) dimension batch_size x 6 - log softmax output of the model
111 | labels: (np.ndarray) dimension batch_size, where each element is a value in [0, 1, 2, 3, 4, 5]
112 |
113 | Returns: (float) accuracy in [0,1]
114 | """
115 | outputs = np.argmax(outputs, axis=1)
116 | return np.sum(outputs==labels)/float(labels.size)
117 |
118 |
119 | # maintain all metrics required in this dictionary- these are used in the training and evaluation loops
120 | metrics = {
121 | 'accuracy': accuracy,
122 | # could add more metrics such as accuracy for each token type
123 | }
124 |
-------------------------------------------------------------------------------- /pytorch/vision/requirements.txt: --------------------------------------------------------------------------------
1 | numpy
2 | Pillow
3 | torch>=1.2
4 | tabulate
5 | tqdm
6 | torchvision
7 |
-------------------------------------------------------------------------------- /pytorch/vision/search_hyperparams.py: --------------------------------------------------------------------------------
1 | """Perform hyperparameters search"""
2 |
3 | import argparse
4 | import os
5 | from subprocess import check_call
6 | import sys
7 |
8 | import utils
9 |
10 |
11 | PYTHON = sys.executable
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--parent_dir', default='experiments/learning_rate',
14 | help='Directory containing params.json')
15 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', help="Directory containing the dataset")
16 |
17 |
18 | def launch_training_job(parent_dir, data_dir, job_name, params):
19 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name
20 |
21 | Args:
22 | parent_dir: (string) parent directory in which the new job folder `job_name` is created
23 | data_dir: (string) directory containing the dataset
24 | params: (Params) hyperparameters to write to the new job's params.json
25 | """
26 | # Create a new folder in parent_dir with unique_name "job_name"
27 | model_dir = os.path.join(parent_dir, job_name)
28 | if not os.path.exists(model_dir):
29 | os.makedirs(model_dir)
30 |
31 | # Write parameters in json file
32 | json_path = os.path.join(model_dir, 'params.json')
33 | params.save(json_path)
34 |
35 | # Launch training with this config
36 | cmd = "{python} train.py --model_dir={model_dir} --data_dir {data_dir}".format(python=PYTHON, model_dir=model_dir,
37 | data_dir=data_dir)
38 | print(cmd)
39 | check_call(cmd, shell=True)
40 |
41 |
42 | if __name__ == "__main__":
43 | # Load the "reference" parameters from parent_dir json file
44 | args = parser.parse_args()
45 | json_path = os.path.join(args.parent_dir, 'params.json')
46 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
47 | params = utils.Params(json_path)
48 |
49 | # Perform hyperparameter search over one parameter
50 | learning_rates = [1e-4, 1e-3, 1e-2]
51 |
52 | for learning_rate in learning_rates:
53 | # Modify the relevant parameter in params
54 | params.learning_rate = learning_rate
55 |
56 | # Launch job (name has to be unique)
57 | job_name = "learning_rate_{}".format(learning_rate)
58 | launch_training_job(args.parent_dir, args.data_dir, job_name, params)
59 |
-------------------------------------------------------------------------------- /pytorch/vision/synthesize_results.py: --------------------------------------------------------------------------------
1 | """Aggregates results from the metrics_val_best_weights.json in a parent folder"""
2 |
3 | import argparse
4 | import json
5 |
import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in folder `parent_dir`. 17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_dev.json` 20 | 21 | Args: 22 | parent_dir: (string) path to directory containing experiments results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_val_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) -------------------------------------------------------------------------------- /pytorch/vision/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | import torch.optim as optim 10 | from torch.autograd import Variable 11 | from tqdm import tqdm 12 | 13 | import utils 14 | import model.net as net 15 | import model.data_loader as data_loader 16 | from evaluate import evaluate 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 20 | help="Directory containing the dataset") 21 | parser.add_argument('--model_dir', default='experiments/base_model', 22 | help="Directory containing params.json") 23 | parser.add_argument('--restore_file', default=None, 24 | help="Optional, name of the file in --model_dir containing weights to reload before \ 25 | training") # 'best' or 'train' 26 | 27 | 28 | def train(model, optimizer, loss_fn, dataloader, metrics, params): 29 | """Train the model on `num_steps` batches 30 | 31 | Args: 32 | model: (torch.nn.Module) the neural network 33 | optimizer: (torch.optim) optimizer for parameters of model 34 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 35 | dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data 36 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each 
batch 37 | params: (Params) hyperparameters 38 | num_steps: (int) number of batches to train on, each of size params.batch_size 39 | """ 40 | 41 | # set model to training mode 42 | model.train() 43 | 44 | # summary for current training loop and a running average object for loss 45 | summ = [] 46 | loss_avg = utils.RunningAverage() 47 | 48 | # Use tqdm for progress bar 49 | with tqdm(total=len(dataloader)) as t: 50 | for i, (train_batch, labels_batch) in enumerate(dataloader): 51 | # move to GPU if available 52 | if params.cuda: 53 | train_batch, labels_batch = train_batch.cuda( 54 | non_blocking=True), labels_batch.cuda(non_blocking=True) 55 | # convert to torch Variables 56 | train_batch, labels_batch = Variable( 57 | train_batch), Variable(labels_batch) 58 | 59 | # compute model output and loss 60 | output_batch = model(train_batch) 61 | loss = loss_fn(output_batch, labels_batch) 62 | 63 | # clear previous gradients, compute gradients of all variables wrt loss 64 | optimizer.zero_grad() 65 | loss.backward() 66 | 67 | # performs updates using calculated gradients 68 | optimizer.step() 69 | 70 | # Evaluate summaries only once in a while 71 | if i % params.save_summary_steps == 0: 72 | # extract data from torch Variable, move to cpu, convert to numpy arrays 73 | output_batch = output_batch.data.cpu().numpy() 74 | labels_batch = labels_batch.data.cpu().numpy() 75 | 76 | # compute all metrics on this batch 77 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 78 | for metric in metrics} 79 | summary_batch['loss'] = loss.item() 80 | summ.append(summary_batch) 81 | 82 | # update the average loss 83 | loss_avg.update(loss.item()) 84 | 85 | t.set_postfix(loss='{:05.3f}'.format(loss_avg())) 86 | t.update() 87 | 88 | # compute mean of all metrics in summary 89 | metrics_mean = {metric: np.mean([x[metric] 90 | for x in summ]) for metric in summ[0]} 91 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) 92 | for k, v in metrics_mean.items()) 93 | logging.info("- Train metrics: " + metrics_string) 94 | 95 | 96 | def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, loss_fn, metrics, params, model_dir, 97 | restore_file=None): 98 | """Train the model and evaluate every epoch. 
99 | 100 | Args: 101 | model: (torch.nn.Module) the neural network 102 | train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data 103 | val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data 104 | optimizer: (torch.optim) optimizer for parameters of model 105 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 106 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 107 | params: (Params) hyperparameters 108 | model_dir: (string) directory containing config, weights and log 109 | restore_file: (string) optional- name of file to restore from (without its extension .pth.tar) 110 | """ 111 | # reload weights from restore_file if specified 112 | if restore_file is not None: 113 | restore_path = os.path.join( 114 | args.model_dir, args.restore_file + '.pth.tar') 115 | logging.info("Restoring parameters from {}".format(restore_path)) 116 | utils.load_checkpoint(restore_path, model, optimizer) 117 | 118 | best_val_acc = 0.0 119 | 120 | for epoch in range(params.num_epochs): 121 | # Run one epoch 122 | logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs)) 123 | 124 | # compute number of batches in one epoch (one full pass over the training set) 125 | train(model, optimizer, loss_fn, train_dataloader, metrics, params) 126 | 127 | # Evaluate for one epoch on validation set 128 | val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params) 129 | 130 | val_acc = val_metrics['accuracy'] 131 | is_best = val_acc >= best_val_acc 132 | 133 | # Save weights 134 | utils.save_checkpoint({'epoch': epoch + 1, 135 | 'state_dict': model.state_dict(), 136 | 'optim_dict': optimizer.state_dict()}, 137 | is_best=is_best, 138 | checkpoint=model_dir) 139 | 140 | # If best_eval, best_save_path 141 | if is_best: 142 | logging.info("- Found new best accuracy") 143 | best_val_acc = val_acc 144 | 145 | # Save best val metrics in a json file in the model directory 146 | best_json_path = os.path.join( 147 | model_dir, "metrics_val_best_weights.json") 148 | utils.save_dict_to_json(val_metrics, best_json_path) 149 | 150 | # Save latest val metrics in a json file in the model directory 151 | last_json_path = os.path.join( 152 | model_dir, "metrics_val_last_weights.json") 153 | utils.save_dict_to_json(val_metrics, last_json_path) 154 | 155 | 156 | if __name__ == '__main__': 157 | 158 | # Load the parameters from json file 159 | args = parser.parse_args() 160 | json_path = os.path.join(args.model_dir, 'params.json') 161 | assert os.path.isfile( 162 | json_path), "No json configuration file found at {}".format(json_path) 163 | params = utils.Params(json_path) 164 | 165 | # use GPU if available 166 | params.cuda = torch.cuda.is_available() 167 | 168 | # Set the random seed for reproducible experiments 169 | torch.manual_seed(230) 170 | if params.cuda: 171 | torch.cuda.manual_seed(230) 172 | 173 | # Set the logger 174 | utils.set_logger(os.path.join(args.model_dir, 'train.log')) 175 | 176 | # Create the input data pipeline 177 | logging.info("Loading the datasets...") 178 | 179 | # fetch dataloaders 180 | dataloaders = data_loader.fetch_dataloader( 181 | ['train', 'val'], args.data_dir, params) 182 | train_dl = dataloaders['train'] 183 | val_dl = dataloaders['val'] 184 | 185 | logging.info("- done.") 186 | 187 | # Define the model and optimizer 188 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 189 | optimizer = 
optim.Adam(model.parameters(), lr=params.learning_rate) 190 | 191 | # fetch loss function and metrics 192 | loss_fn = net.loss_fn 193 | metrics = net.metrics 194 | 195 | # Train the model 196 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 197 | train_and_evaluate(model, train_dl, val_dl, optimizer, loss_fn, metrics, params, args.model_dir, 198 | args.restore_file) 199 | -------------------------------------------------------------------------------- /pytorch/vision/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import shutil 5 | 6 | import torch 7 | 8 | class Params(): 9 | """Class that loads hyperparameters from a json file. 10 | 11 | Example: 12 | ``` 13 | params = Params(json_path) 14 | print(params.learning_rate) 15 | params.learning_rate = 0.5 # change the value of learning_rate in params 16 | ``` 17 | """ 18 | 19 | def __init__(self, json_path): 20 | with open(json_path) as f: 21 | params = json.load(f) 22 | self.__dict__.update(params) 23 | 24 | def save(self, json_path): 25 | with open(json_path, 'w') as f: 26 | json.dump(self.__dict__, f, indent=4) 27 | 28 | def update(self, json_path): 29 | """Loads parameters from json file""" 30 | with open(json_path) as f: 31 | params = json.load(f) 32 | self.__dict__.update(params) 33 | 34 | @property 35 | def dict(self): 36 | """Gives dict-like access to Params instance by `params.dict['learning_rate']""" 37 | return self.__dict__ 38 | 39 | 40 | class RunningAverage(): 41 | """A simple class that maintains the running average of a quantity 42 | 43 | Example: 44 | ``` 45 | loss_avg = RunningAverage() 46 | loss_avg.update(2) 47 | loss_avg.update(4) 48 | loss_avg() = 3 49 | ``` 50 | """ 51 | def __init__(self): 52 | self.steps = 0 53 | self.total = 0 54 | 55 | def update(self, val): 56 | self.total += val 57 | self.steps += 1 58 | 59 | def __call__(self): 60 | return self.total/float(self.steps) 61 | 62 | 63 | def set_logger(log_path): 64 | """Set the logger to log info in terminal and file `log_path`. 65 | 66 | In general, it is useful to have a logger so that every output to the terminal is saved 67 | in a permanent file. Here we save it to `model_dir/train.log`. 68 | 69 | Example: 70 | ``` 71 | logging.info("Starting training...") 72 | ``` 73 | 74 | Args: 75 | log_path: (string) where to log 76 | """ 77 | logger = logging.getLogger() 78 | logger.setLevel(logging.INFO) 79 | 80 | if not logger.handlers: 81 | # Logging to a file 82 | file_handler = logging.FileHandler(log_path) 83 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 84 | logger.addHandler(file_handler) 85 | 86 | # Logging to console 87 | stream_handler = logging.StreamHandler() 88 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 89 | logger.addHandler(stream_handler) 90 | 91 | 92 | def save_dict_to_json(d, json_path): 93 | """Saves dict of floats in json file 94 | 95 | Args: 96 | d: (dict) of float-castable values (np.float, int, float, etc.) 97 | json_path: (string) path to json file 98 | """ 99 | with open(json_path, 'w') as f: 100 | # We need to convert the values to float for json (it doesn't accept np.array, np.float, ) 101 | d = {k: float(v) for k, v in d.items()} 102 | json.dump(d, f, indent=4) 103 | 104 | 105 | def save_checkpoint(state, is_best, checkpoint): 106 | """Saves model and training parameters at checkpoint + 'last.pth.tar'. 
If is_best==True, also saves
107 | checkpoint + 'best.pth.tar'
108 |
109 | Args:
110 | state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
111 | is_best: (bool) True if it is the best model seen till now
112 | checkpoint: (string) folder where parameters are to be saved
113 | """
114 | filepath = os.path.join(checkpoint, 'last.pth.tar')
115 | if not os.path.exists(checkpoint):
116 | print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint))
117 | os.mkdir(checkpoint)
118 | else:
119 | print("Checkpoint Directory exists! ")
120 | torch.save(state, filepath)
121 | if is_best:
122 | shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))
123 |
124 |
125 | def load_checkpoint(checkpoint, model, optimizer=None):
126 | """Loads model parameters (state_dict) from the file `checkpoint`. If optimizer is provided, loads state_dict of
127 | optimizer assuming it is present in checkpoint.
128 |
129 | Args:
130 | checkpoint: (string) filename which needs to be loaded
131 | model: (torch.nn.Module) model for which the parameters are loaded
132 | optimizer: (torch.optim) optional: resume optimizer from checkpoint
133 | """
134 | if not os.path.exists(checkpoint):
135 | raise FileNotFoundError("File doesn't exist {}".format(checkpoint))
136 | checkpoint = torch.load(checkpoint)
137 | model.load_state_dict(checkpoint['state_dict'])
138 |
139 | if optimizer:
140 | optimizer.load_state_dict(checkpoint['optim_dict'])
141 |
142 | return checkpoint
-------------------------------------------------------------------------------- /tensorflow/nlp/README.md: --------------------------------------------------------------------------------
1 | # Named Entity Recognition with Tensorflow
2 |
3 | _Authors: Guillaume Genthial and Olivier Moindrot_
4 |
5 | Take the time to read the [tutorials](https://cs230-stanford.github.io).
6 |
7 | Note: all scripts must be run in `tensorflow/nlp`.
8 |
9 | ## Requirements
10 |
11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html).
12 |
13 | ```
14 | virtualenv -p python3 .env
15 | source .env/bin/activate
16 | pip install -r requirements.txt
17 | ```
18 |
19 | When you're done working on the project, deactivate the virtual environment with `deactivate`.
20 |
21 | ## Task
22 |
23 | Given a sentence, give a tag to each word ([Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition))
24 |
25 | ```
26 | John lives in New York
27 | B-PER O    O  B-LOC I-LOC
28 | ```
29 |
30 | ## [optional] Download the Kaggle dataset (~5 min)
31 |
32 | We provide a small subset of the Kaggle dataset (30 sentences) for testing in `data/small` but you are encouraged to download the original version on the [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) website.
33 |
34 | 1. **Download the dataset** `ner_dataset.csv` on [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) and save it under the `nlp/data/kaggle` directory. Make sure you download the simple version `ner_dataset.csv` and NOT the full version `ner.csv`.
35 |
36 | 2. **Build the dataset** Run the following script
37 |
38 | ```
39 | python build_kaggle_dataset.py
40 | ```
41 |
42 | It will extract the sentences and labels from the dataset, split it into train / test / dev and save it in a convenient format for our model.
43 |
44 | _Debug_ If you get some errors, check that you downloaded the right file and saved it in the right directory.
If you have issues with encoding, try running the script with Python 2.7.
45 |
46 | 3. In the next section, replace `data/small` with `data/kaggle`
47 |
48 | ## Quickstart (~10 min)
49 |
50 | 1. **Build** vocabularies and parameters for your dataset by running
51 |
52 | ```
53 | python build_vocab.py --data_dir data/small
54 | ```
55 |
56 | It will write vocabulary files `words.txt` and `tags.txt` containing the words and tags in the dataset. It will also save a `dataset_params.json` with some extra information.
57 |
58 | 2. **Your first experiment** We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the parameters for the experiment. It looks like
59 |
60 | ```json
61 | {
62 |   "learning_rate": 1e-3,
63 |   "batch_size": 5,
64 |   "num_epochs": 2
65 | }
66 | ```
67 |
68 | For every new experiment, you will need to create a new directory under `experiments` with a `params.json` file.
69 |
70 | 3. **Train** your experiment. Simply run
71 |
72 | ```
73 | python train.py --data_dir data/small --model_dir experiments/base_model
74 | ```
75 |
76 | It will instantiate a model and train it on the training set following the parameters specified in `params.json`. It will also evaluate some metrics on the development set.
77 |
78 | 4. **Your first hyperparameters search** We created a new directory `learning_rate` in `experiments` for you. Now, run
79 |
80 | ```
81 | python search_hyperparams.py --data_dir data/small --parent_dir experiments/learning_rate
82 | ```
83 |
84 | It will train and evaluate a model with different values of learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`.
85 |
86 | 5. **Display the results** of the hyperparameters search in a nice format
87 |
88 | ```
89 | python synthesize_results.py --parent_dir experiments/learning_rate
90 | ```
91 |
92 | 6. **Evaluation on the test set** Once you've run many experiments and selected your best model and hyperparameters based on the performance on the development set, you can finally evaluate the performance of your model on the test set. Run
93 |
94 | ```
95 | python evaluate.py --data_dir data/small --model_dir experiments/base_model
96 | ```
97 |
98 | ## Guidelines for more advanced use
99 |
100 | We recommend reading through `train.py` to get a high-level overview of the steps:
101 |
102 | - loading the parameters for the experiment (the `params.json`) and for the dataset (the `dataset_params.json`)
103 | - loading the vocabularies from the `words.txt` and `tags.txt` files.
104 | - creating the sentences / labels datasets (`tf.data.TextLineDataset` instances reading the files and replacing tokens by their ids)
105 | - creating the input of our model by zipping the sentences and labels together (`input_fn(...)`), as well as performing batching and padding (for sentences of different length).
106 | - creating the model (=nodes / ops of the `tf.Graph()`) by calling `model_fn(...)`
107 | - training the model for a given number of epochs by calling `train_and_evaluate(...)`
108 |
109 | Once you get the high-level idea, depending on your dataset, you might want to modify
110 |
111 | - `model/model_fn.py` to change the model (a sketch of this kind of change follows this list)
112 | - `model/input_fn.py` to change the way you read data / change the way you combine your different files
113 | - `train.py` and `evaluate.py` to change the story-line (maybe you need another vocabulary, etc.)
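For instance, adding an extra evaluation metric usually only touches the `metrics` dictionary built in `model/model_fn.py`. The snippet below is a rough sketch of that kind of change under TF 1.x; `labels`, `predictions` and `loss` stand in for the tensors your `model_fn` already defines, and the placeholders are only there to keep the example self-contained:

```python
import tensorflow as tf  # TF 1.x

# Stand-ins for the tensors that model_fn already builds
labels = tf.placeholder(tf.int64, shape=[None])
predictions = tf.placeholder(tf.int64, shape=[None])
loss = tf.placeholder(tf.float32, shape=[])

# Each tf.metrics op returns a (value, update_op) pair; the update op is run on every
# batch and the value is read back at the end of the epoch.
metrics = {
    'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions),
    # any other tf.metrics op can be added the same way, e.g. a running mean of the loss
    'loss': tf.metrics.mean(loss),
}
```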
114 | 115 | If you want to compute new metrics for which you can find a [tensorflow implementation](https://www.tensorflow.org/api_docs/python/tf/metrics), you can define it in the `model_fn.py` (add it to the `metrics` dictionnary). It will automatically be updated during the training and will be displayed at the end of each epoch. 116 | 117 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs. 118 | 119 | ## Resources 120 | 121 | Note that this repository uses Tensorflow 1.14. Tensorflow 2 has just been 122 | released, so the links below now point to Tensorflow 2 documentation. You can 123 | navigate to the old 1.14 docs through the API dropdown on navigation bar. 124 | 125 | There are major changes between TF 1 and TF 2, most notably Eager Execution 126 | being the new default mode. If your team is starting with a new project, we 127 | recommend using Tensorflow 2. 128 | 129 | Introduction to the `tf.data` pipeline 130 | 131 | - [programmer's guide](https://www.tensorflow.org/programmers_guide/datasets) 132 | - [consuming text data](https://www.tensorflow.org/programmers_guide/datasets#consuming_text_data) 133 | 134 | Tensorflow seq2seq using the tf.data pipeline: 135 | 136 | - [documentation](https://www.tensorflow.org/tutorials/seq2seq) 137 | - [github](https://github.com/tensorflow/nmt/) 138 | -------------------------------------------------------------------------------- /tensorflow/nlp/build_kaggle_dataset.py: -------------------------------------------------------------------------------- 1 | """Read, split and save the kaggle dataset for our model""" 2 | 3 | import csv 4 | import os 5 | import sys 6 | 7 | 8 | def load_dataset(path_csv): 9 | """Loads dataset into memory from csv file""" 10 | # Open the csv file, need to specify the encoding for python3 11 | use_python3 = sys.version_info[0] >= 3 12 | with (open(path_csv, encoding="windows-1252") if use_python3 else open(path_csv)) as f: 13 | csv_file = csv.reader(f, delimiter=',') 14 | dataset = [] 15 | words, tags = [], [] 16 | 17 | # Each line of the csv corresponds to one word 18 | for idx, row in enumerate(csv_file): 19 | if idx == 0: continue 20 | sentence, word, pos, tag = row 21 | # If the first column is non empty it means we reached a new sentence 22 | if len(sentence) != 0: 23 | if len(words) > 0: 24 | assert len(words) == len(tags) 25 | dataset.append((words, tags)) 26 | words, tags = [], [] 27 | try: 28 | word, tag = str(word), str(tag) 29 | words.append(word) 30 | tags.append(tag) 31 | except UnicodeDecodeError as e: 32 | print("An exception was raised, skipping a word: {}".format(e)) 33 | pass 34 | 35 | return dataset 36 | 37 | 38 | def save_dataset(dataset, save_dir): 39 | """Writes sentences.txt and labels.txt files in save_dir from dataset 40 | 41 | Args: 42 | dataset: ([(["a", "cat"], ["O", "O"]), ...]) 43 | save_dir: (string) 44 | """ 45 | # Create directory if it doesn't exist 46 | print("Saving in {}...".format(save_dir)) 47 | if not os.path.exists(save_dir): 48 | os.makedirs(save_dir) 49 | 50 | # Export the dataset 51 | with open(os.path.join(save_dir, 'sentences.txt'), 'w') as file_sentences: 52 | with open(os.path.join(save_dir, 'labels.txt'), 'w') as file_labels: 53 | for words, tags in dataset: 54 | file_sentences.write("{}\n".format(" ".join(words))) 55 | file_labels.write("{}\n".format(" ".join(tags))) 56 | print("- done.") 57 | 58 | 59 | if __name__ == "__main__": 60 | # Check that the dataset exists (you need to make sure you haven't 
downloaded the `ner.csv`) 61 | path_dataset = 'data/kaggle/ner_dataset.csv' 62 | msg = "{} file not found. Make sure you have downloaded the right dataset".format(path_dataset) 63 | assert os.path.isfile(path_dataset), msg 64 | 65 | # Load the dataset into memory 66 | print("Loading Kaggle dataset into memory...") 67 | dataset = load_dataset(path_dataset) 68 | print("- done.") 69 | 70 | # Split the dataset into train, dev and split (dummy split with no shuffle) 71 | train_dataset = dataset[:int(0.7*len(dataset))] 72 | dev_dataset = dataset[int(0.7*len(dataset)) : int(0.85*len(dataset))] 73 | test_dataset = dataset[int(0.85*len(dataset)):] 74 | 75 | # Save the datasets to files 76 | save_dataset(train_dataset, 'data/kaggle/train') 77 | save_dataset(dev_dataset, 'data/kaggle/dev') 78 | save_dataset(test_dataset, 'data/kaggle/test') -------------------------------------------------------------------------------- /tensorflow/nlp/build_vocab.py: -------------------------------------------------------------------------------- 1 | """Build vocabularies of words and tags from datasets""" 2 | 3 | import argparse 4 | from collections import Counter 5 | import json 6 | import os 7 | import sys 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--min_count_word', default=1, help="Minimum count for words in the dataset", 12 | type=int) 13 | parser.add_argument('--min_count_tag', default=1, help="Minimum count for tags in the dataset", 14 | type=int) 15 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 16 | 17 | # Hyper parameters for the vocab 18 | NUM_OOV_BUCKETS = 1 # number of buckets (= number of ids) for unknown words 19 | PAD_WORD = '' 20 | PAD_TAG = 'O' 21 | 22 | 23 | def save_vocab_to_txt_file(vocab, txt_path): 24 | """Writes one token per line, 0-based line id corresponds to the id of the token. 
25 | 26 | Args: 27 | vocab: (iterable object) yields token 28 | txt_path: (stirng) path to vocab file 29 | """ 30 | with open(txt_path, "w") as f: 31 | f.write("\n".join(token for token in vocab)) 32 | 33 | 34 | def save_dict_to_json(d, json_path): 35 | """Saves dict to json file 36 | 37 | Args: 38 | d: (dict) 39 | json_path: (string) path to json file 40 | """ 41 | with open(json_path, 'w') as f: 42 | d = {k: v for k, v in d.items()} 43 | json.dump(d, f, indent=4) 44 | 45 | 46 | def update_vocab(txt_path, vocab): 47 | """Update word and tag vocabulary from dataset 48 | 49 | Args: 50 | txt_path: (string) path to file, one sentence per line 51 | vocab: (dict or Counter) with update method 52 | 53 | Returns: 54 | dataset_size: (int) number of elements in the dataset 55 | """ 56 | with open(txt_path) as f: 57 | for i, line in enumerate(f): 58 | vocab.update(line.strip().split(' ')) 59 | 60 | 61 | return i + 1 62 | 63 | 64 | if __name__ == '__main__': 65 | args = parser.parse_args() 66 | 67 | # Build word vocab with train and test datasets 68 | print("Building word vocabulary...") 69 | words = Counter() 70 | size_train_sentences = update_vocab(os.path.join(args.data_dir, 'train/sentences.txt'), words) 71 | size_dev_sentences = update_vocab(os.path.join(args.data_dir, 'dev/sentences.txt'), words) 72 | size_test_sentences = update_vocab(os.path.join(args.data_dir, 'test/sentences.txt'), words) 73 | print("- done.") 74 | 75 | # Build tag vocab with train and test datasets 76 | print("Building tag vocabulary...") 77 | tags = Counter() 78 | size_train_tags = update_vocab(os.path.join(args.data_dir, 'train/labels.txt'), tags) 79 | size_dev_tags = update_vocab(os.path.join(args.data_dir, 'dev/labels.txt'), tags) 80 | size_test_tags = update_vocab(os.path.join(args.data_dir, 'test/labels.txt'), tags) 81 | print("- done.") 82 | 83 | # Assert same number of examples in datasets 84 | assert size_train_sentences == size_train_tags 85 | assert size_dev_sentences == size_dev_tags 86 | assert size_test_sentences == size_test_tags 87 | 88 | # Only keep most frequent tokens 89 | words = [tok for tok, count in words.items() if count >= args.min_count_word] 90 | tags = [tok for tok, count in tags.items() if count >= args.min_count_tag] 91 | 92 | # Add pad tokens 93 | if PAD_WORD not in words: words.append(PAD_WORD) 94 | if PAD_TAG not in tags: tags.append(PAD_TAG) 95 | 96 | # Save vocabularies to file 97 | print("Saving vocabularies to file...") 98 | save_vocab_to_txt_file(words, os.path.join(args.data_dir, 'words.txt')) 99 | save_vocab_to_txt_file(tags, os.path.join(args.data_dir, 'tags.txt')) 100 | print("- done.") 101 | 102 | # Save datasets properties in json file 103 | sizes = { 104 | 'train_size': size_train_sentences, 105 | 'dev_size': size_dev_sentences, 106 | 'test_size': size_test_sentences, 107 | 'vocab_size': len(words) + NUM_OOV_BUCKETS, 108 | 'number_of_tags': len(tags), 109 | 'pad_word': PAD_WORD, 110 | 'pad_tag': PAD_TAG, 111 | 'num_oov_buckets': NUM_OOV_BUCKETS 112 | } 113 | save_dict_to_json(sizes, os.path.join(args.data_dir, 'dataset_params.json')) 114 | 115 | # Logging sizes 116 | to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items()) 117 | print("Characteristics of the dataset:\n{}".format(to_print)) -------------------------------------------------------------------------------- /tensorflow/nlp/data/kaggle/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/nlp/data/kaggle/.gitkeep -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/dev/labels.txt: -------------------------------------------------------------------------------- 1 | B-PER O O B-LOC I-LOC 2 | B-PER O O B-LOC 3 | B-PER O O B-LOC 4 | B-PER O O B-LOC 5 | B-PER O O B-LOC 6 | B-PER O O B-LOC 7 | B-PER O O B-LOC 8 | B-PER O O B-LOC 9 | B-PER O O B-LOC 10 | B-PER O O B-LOC -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/dev/sentences.txt: -------------------------------------------------------------------------------- 1 | John lives in New York 2 | Kate lives in London 3 | Ziang lives in Beijing 4 | Pierre lives in Paris 5 | Dominik lives in Berlin 6 | Raul lives in Mexico 7 | Sergio lives in Rome 8 | Alexandr lives in Moscow 9 | Ines lives in Casablanca 10 | Jack lives in San Francisco -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/test/labels.txt: -------------------------------------------------------------------------------- 1 | B-PER O O B-LOC I-LOC 2 | B-PER O O B-LOC 3 | B-PER O O B-LOC 4 | B-PER O O B-LOC 5 | B-PER O O B-LOC 6 | B-PER O O B-LOC 7 | B-PER O O B-LOC 8 | B-PER O O B-LOC 9 | B-PER O O B-LOC 10 | B-PER O O B-LOC -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/test/sentences.txt: -------------------------------------------------------------------------------- 1 | John lives in New York 2 | Kate lives in London 3 | Ziang lives in Beijing 4 | Pierre lives in Paris 5 | Dominik lives in Berlin 6 | Raul lives in Mexico 7 | Sergio lives in Rome 8 | Alexandr lives in Moscow 9 | Ines lives in Casablanca 10 | Jack lives in San Francisco -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/train/labels.txt: -------------------------------------------------------------------------------- 1 | B-PER O O B-LOC I-LOC 2 | B-PER O O B-LOC 3 | B-PER O O B-LOC 4 | B-PER O O B-LOC 5 | B-PER O O B-LOC 6 | B-PER O O B-LOC 7 | B-PER O O B-LOC 8 | B-PER O O B-LOC 9 | B-PER O O B-LOC 10 | B-PER O O B-LOC -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/train/sentences.txt: -------------------------------------------------------------------------------- 1 | John lives in New York 2 | Kate lives in London 3 | Ziang lives in Beijing 4 | Pierre lives in Paris 5 | Dominik lives in Berlin 6 | Raul lives in Mexico 7 | Sergio lives in Rome 8 | Alexandr lives in Moscow 9 | Ines lives in Casablanca 10 | Jack lives in San Francisco -------------------------------------------------------------------------------- /tensorflow/nlp/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluate the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from model.utils import Params 11 | from model.utils import set_logger 12 | from model.evaluation import evaluate 13 | from model.input_fn import input_fn 14 | from model.input_fn import load_dataset_from_text 15 | from model.model_fn import model_fn 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--model_dir', default='experiments/base_model', 20 | 
help="Directory containing params.json") 21 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 22 | parser.add_argument('--restore_from', default='best_weights', 23 | help="Subdirectory of model dir or file containing the weights") 24 | 25 | if __name__ == '__main__': 26 | # Set the random seed for the whole graph 27 | tf.set_random_seed(230) 28 | 29 | # Load the parameters 30 | args = parser.parse_args() 31 | json_path = os.path.join(args.model_dir, 'params.json') 32 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 33 | params = Params(json_path) 34 | 35 | # Load the parameters from the dataset, that gives the size etc. into params 36 | json_path = os.path.join(args.data_dir, 'dataset_params.json') 37 | assert os.path.isfile(json_path), "No json file found at {}, run build.py".format(json_path) 38 | params.update(json_path) 39 | num_oov_buckets = params.num_oov_buckets # number of buckets for unknown words 40 | 41 | # Set the logger 42 | set_logger(os.path.join(args.model_dir, 'evaluate.log')) 43 | 44 | # Get paths for vocabularies and dataset 45 | path_words = os.path.join(args.data_dir, 'words.txt') 46 | path_tags = os.path.join(args.data_dir, 'tags.txt') 47 | path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt') 48 | path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt') 49 | 50 | # Load Vocabularies 51 | words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets) 52 | tags = tf.contrib.lookup.index_table_from_file(path_tags) 53 | 54 | # Create the input data pipeline 55 | logging.info("Creating the dataset...") 56 | test_sentences = load_dataset_from_text(path_eval_sentences, words) 57 | test_labels = load_dataset_from_text(path_eval_labels, tags) 58 | 59 | # Specify other parameters for the dataset and the model 60 | params.eval_size = params.test_size 61 | params.id_pad_word = words.lookup(tf.constant(params.pad_word)) 62 | params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag)) 63 | 64 | # Create iterator over the test set 65 | inputs = input_fn('eval', test_sentences, test_labels, params) 66 | logging.info("- done.") 67 | 68 | # Define the model 69 | logging.info("Creating the model...") 70 | model_spec = model_fn('eval', inputs, params, reuse=False) 71 | logging.info("- done.") 72 | 73 | logging.info("Starting evaluation") 74 | evaluate(model_spec, args.model_dir, params, args.restore_from) 75 | -------------------------------------------------------------------------------- /tensorflow/nlp/experiments/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/nlp/experiments/.gitkeep -------------------------------------------------------------------------------- /tensorflow/nlp/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_version": "lstm", 3 | "lstm_num_units": 50, 4 | "embedding_size": 50, 5 | 6 | "learning_rate": 1e-3, 7 | "batch_size": 32, 8 | "num_epochs": 10, 9 | "dropout_rate": 0.3, 10 | 11 | "save_summary_steps": 100 12 | } 13 | -------------------------------------------------------------------------------- /tensorflow/nlp/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_version": "lstm", 3 
| "lstm_num_units": 50, 4 | "embedding_size": 50, 5 | 6 | "learning_rate": 1e-3, 7 | "batch_size": 32, 8 | "num_epochs": 2, 9 | "dropout_rate": 0.3, 10 | 11 | "save_summary_steps": 100 12 | } 13 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/nlp/model/__init__.py -------------------------------------------------------------------------------- /tensorflow/nlp/model/evaluation.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for evaluation""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | 11 | 12 | def evaluate_sess(sess, model_spec, num_steps, writer=None, params=None): 13 | """Train the model on `num_steps` batches. 14 | 15 | Args: 16 | sess: (tf.Session) current session 17 | model_spec: (dict) contains the graph operations or nodes needed for training 18 | num_steps: (int) train for this number of batches 19 | writer: (tf.summary.FileWriter) writer for summaries. Is None if we don't log anything 20 | params: (Params) hyperparameters 21 | """ 22 | update_metrics = model_spec['update_metrics'] 23 | eval_metrics = model_spec['metrics'] 24 | global_step = tf.train.get_global_step() 25 | 26 | # Load the evaluation dataset into the pipeline and initialize the metrics init op 27 | sess.run(model_spec['iterator_init_op']) 28 | sess.run(model_spec['metrics_init_op']) 29 | 30 | # compute metrics over the dataset 31 | for _ in range(num_steps): 32 | sess.run(update_metrics) 33 | 34 | # Get the values of the metrics 35 | metrics_values = {k: v[0] for k, v in eval_metrics.items()} 36 | metrics_val = sess.run(metrics_values) 37 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 38 | logging.info("- Eval metrics: " + metrics_string) 39 | 40 | # Add summaries manually to writer at global_step_val 41 | if writer is not None: 42 | global_step_val = sess.run(global_step) 43 | for tag, val in metrics_val.items(): 44 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) 45 | writer.add_summary(summ, global_step_val) 46 | 47 | return metrics_val 48 | 49 | 50 | def evaluate(model_spec, model_dir, params, restore_from): 51 | """Evaluate the model 52 | 53 | Args: 54 | model_spec: (dict) contains the graph operations or nodes needed for evaluation 55 | model_dir: (string) directory containing config, weights and log 56 | params: (Params) contains hyperparameters of the model. 
57 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 58 | restore_from: (string) directory or file containing weights to restore the graph 59 | """ 60 | # Initialize tf.Saver 61 | saver = tf.train.Saver() 62 | 63 | with tf.Session() as sess: 64 | # Initialize the lookup table 65 | sess.run(model_spec['variable_init_op']) 66 | 67 | # Reload weights from the weights subdirectory 68 | save_path = os.path.join(model_dir, restore_from) 69 | if os.path.isdir(save_path): 70 | save_path = tf.train.latest_checkpoint(save_path) 71 | saver.restore(sess, save_path) 72 | 73 | # Evaluate 74 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 75 | metrics = evaluate_sess(sess, model_spec, num_steps) 76 | metrics_name = '_'.join(restore_from.split('/')) 77 | save_path = os.path.join(model_dir, "metrics_test_{}.json".format(metrics_name)) 78 | save_dict_to_json(metrics, save_path) 79 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/input_fn.py: -------------------------------------------------------------------------------- 1 | """Create the input data pipeline using `tf.data`""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def load_dataset_from_text(path_txt, vocab): 7 | """Create tf.data Instance from txt file 8 | 9 | Args: 10 | path_txt: (string) path containing one example per line 11 | vocab: (tf.lookuptable) 12 | 13 | Returns: 14 | dataset: (tf.Dataset) yielding list of ids of tokens for each example 15 | """ 16 | # Load txt file, one example per line 17 | dataset = tf.data.TextLineDataset(path_txt) 18 | 19 | # Convert line into list of tokens, splitting by white space 20 | dataset = dataset.map(lambda string: tf.string_split([string]).values) 21 | 22 | # Lookup tokens to return their ids 23 | dataset = dataset.map(lambda tokens: (vocab.lookup(tokens), tf.size(tokens))) 24 | 25 | return dataset 26 | 27 | 28 | def input_fn(mode, sentences, labels, params): 29 | """Input function for NER 30 | 31 | Args: 32 | mode: (string) 'train', 'eval' or any other mode you can think of 33 | At training, we shuffle the data and have multiple epochs 34 | sentences: (tf.Dataset) yielding list of ids of words 35 | datasets: (tf.Dataset) yielding list of ids of tags 36 | params: (Params) contains hyperparameters of the model (ex: `params.num_epochs`) 37 | 38 | """ 39 | # Load all the dataset in memory for shuffling is training 40 | is_training = (mode == 'train') 41 | buffer_size = params.buffer_size if is_training else 1 42 | 43 | # Zip the sentence and the labels together 44 | dataset = tf.data.Dataset.zip((sentences, labels)) 45 | 46 | # Create batches and pad the sentences of different length 47 | padded_shapes = ((tf.TensorShape([None]), # sentence of unknown size 48 | tf.TensorShape([])), # size(words) 49 | (tf.TensorShape([None]), # labels of unknown size 50 | tf.TensorShape([]))) # size(tags) 51 | 52 | padding_values = ((params.id_pad_word, # sentence padded on the right with id_pad_word 53 | 0), # size(words) -- unused 54 | (params.id_pad_tag, # labels padded on the right with id_pad_tag 55 | 0)) # size(tags) -- unused 56 | 57 | 58 | dataset = (dataset 59 | .shuffle(buffer_size=buffer_size) 60 | .padded_batch(params.batch_size, padded_shapes=padded_shapes, padding_values=padding_values) 61 | .prefetch(1) # make sure you always have one batch ready to serve 62 | ) 63 | 64 | # Create initializable iterator from this dataset so that we can reset at each epoch 65 | iterator = 
dataset.make_initializable_iterator() 66 | 67 | # Query the output of the iterator for input to the model 68 | ((sentence, sentence_lengths), (labels, _)) = iterator.get_next() 69 | init_op = iterator.initializer 70 | 71 | # Build and return a dictionnary containing the nodes / ops 72 | inputs = { 73 | 'sentence': sentence, 74 | 'labels': labels, 75 | 'sentence_lengths': sentence_lengths, 76 | 'iterator_init_op': init_op 77 | } 78 | 79 | return inputs 80 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/model_fn.py: -------------------------------------------------------------------------------- 1 | """Define the model.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def build_model(mode, inputs, params): 7 | """Compute logits of the model (output distribution) 8 | 9 | Args: 10 | mode: (string) 'train', 'eval', etc. 11 | inputs: (dict) contains the inputs of the graph (features, labels...) 12 | this can be `tf.placeholder` or outputs of `tf.data` 13 | params: (Params) contains hyperparameters of the model (ex: `params.learning_rate`) 14 | 15 | Returns: 16 | output: (tf.Tensor) output of the model 17 | """ 18 | sentence = inputs['sentence'] 19 | 20 | if params.model_version == 'lstm': 21 | # Get word embeddings for each token in the sentence 22 | embeddings = tf.get_variable(name="embeddings", dtype=tf.float32, 23 | shape=[params.vocab_size, params.embedding_size]) 24 | sentence = tf.nn.embedding_lookup(embeddings, sentence) 25 | 26 | # Apply LSTM over the embeddings 27 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(params.lstm_num_units) 28 | output, _ = tf.nn.dynamic_rnn(lstm_cell, sentence, dtype=tf.float32) 29 | 30 | # Compute logits from the output of the LSTM 31 | logits = tf.layers.dense(output, params.number_of_tags) 32 | 33 | else: 34 | raise NotImplementedError("Unknown model version: {}".format(params.model_version)) 35 | 36 | return logits 37 | 38 | 39 | def model_fn(mode, inputs, params, reuse=False): 40 | """Model function defining the graph operations. 41 | 42 | Args: 43 | mode: (string) 'train', 'eval', etc. 44 | inputs: (dict) contains the inputs of the graph (features, labels...) 
45 | this can be `tf.placeholder` or outputs of `tf.data` 46 | params: (Params) contains hyperparameters of the model (ex: `params.learning_rate`) 47 | reuse: (bool) whether to reuse the weights 48 | 49 | Returns: 50 | model_spec: (dict) contains the graph operations or nodes needed for training / evaluation 51 | """ 52 | is_training = (mode == 'train') 53 | labels = inputs['labels'] 54 | sentence_lengths = inputs['sentence_lengths'] 55 | 56 | # ----------------------------------------------------------- 57 | # MODEL: define the layers of the model 58 | with tf.variable_scope('model', reuse=reuse): 59 | # Compute the output distribution of the model and the predictions 60 | logits = build_model(mode, inputs, params) 61 | predictions = tf.argmax(logits, -1) 62 | 63 | # Define loss and accuracy (we need to apply a mask to account for padding) 64 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels) 65 | mask = tf.sequence_mask(sentence_lengths) 66 | losses = tf.boolean_mask(losses, mask) 67 | loss = tf.reduce_mean(losses) 68 | accuracy = tf.reduce_mean(tf.cast(tf.equal(labels, predictions), tf.float32)) 69 | 70 | # Define training step that minimizes the loss with the Adam optimizer 71 | if is_training: 72 | optimizer = tf.train.AdamOptimizer(params.learning_rate) 73 | global_step = tf.train.get_or_create_global_step() 74 | train_op = optimizer.minimize(loss, global_step=global_step) 75 | 76 | # ----------------------------------------------------------- 77 | # METRICS AND SUMMARIES 78 | # Metrics for evaluation using tf.metrics (average over whole dataset) 79 | with tf.variable_scope("metrics"): 80 | metrics = { 81 | 'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions), 82 | 'loss': tf.metrics.mean(loss) 83 | } 84 | 85 | # Group the update ops for the tf.metrics 86 | update_metrics_op = tf.group(*[op for _, op in metrics.values()]) 87 | 88 | # Get the op to reset the local variables used in tf.metrics 89 | metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 90 | metrics_init_op = tf.variables_initializer(metric_variables) 91 | 92 | # Summaries for training 93 | tf.summary.scalar('loss', loss) 94 | tf.summary.scalar('accuracy', accuracy) 95 | 96 | # ----------------------------------------------------------- 97 | # MODEL SPECIFICATION 98 | # Create the model specification and return it 99 | # It contains nodes or operations in the graph that will be used for training and evaluation 100 | model_spec = inputs 101 | variable_init_op = tf.group(*[tf.global_variables_initializer(), tf.tables_initializer()]) 102 | model_spec['variable_init_op'] = variable_init_op 103 | model_spec["predictions"] = predictions 104 | model_spec['loss'] = loss 105 | model_spec['accuracy'] = accuracy 106 | model_spec['metrics_init_op'] = metrics_init_op 107 | model_spec['metrics'] = metrics 108 | model_spec['update_metrics'] = update_metrics_op 109 | model_spec['summary_op'] = tf.summary.merge_all() 110 | 111 | if is_training: 112 | model_spec['train_op'] = train_op 113 | 114 | return model_spec 115 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/training.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for training""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | from model.evaluation import 
evaluate_sess 11 | 12 | 13 | def train_sess(sess, model_spec, num_steps, writer, params): 14 | """Train the model on `num_steps` batches 15 | 16 | Args: 17 | sess: (tf.Session) current session 18 | model_spec: (dict) contains the graph operations or nodes needed for training 19 | num_steps: (int) train for this number of batches 20 | writer: (tf.summary.FileWriter) writer for summaries 21 | params: (Params) hyperparameters 22 | """ 23 | # Get relevant graph operations or nodes needed for training 24 | loss = model_spec['loss'] 25 | train_op = model_spec['train_op'] 26 | update_metrics = model_spec['update_metrics'] 27 | metrics = model_spec['metrics'] 28 | summary_op = model_spec['summary_op'] 29 | global_step = tf.train.get_global_step() 30 | 31 | # Load the training dataset into the pipeline and initialize the metrics local variables 32 | sess.run(model_spec['iterator_init_op']) 33 | sess.run(model_spec['metrics_init_op']) 34 | 35 | # Use tqdm for progress bar 36 | t = trange(num_steps) 37 | for i in t: 38 | # Evaluate summaries for tensorboard only once in a while 39 | if i % params.save_summary_steps == 0: 40 | # Perform a mini-batch update 41 | _, _, loss_val, summ, global_step_val = sess.run([train_op, update_metrics, loss, 42 | summary_op, global_step]) 43 | # Write summaries for tensorboard 44 | writer.add_summary(summ, global_step_val) 45 | else: 46 | _, _, loss_val = sess.run([train_op, update_metrics, loss]) 47 | # Log the loss in the tqdm progress bar 48 | t.set_postfix(loss='{:05.3f}'.format(loss_val)) 49 | 50 | 51 | metrics_values = {k: v[0] for k, v in metrics.items()} 52 | metrics_val = sess.run(metrics_values) 53 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 54 | logging.info("- Train metrics: " + metrics_string) 55 | 56 | 57 | def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, restore_from=None): 58 | """Train the model and evaluate every epoch. 59 | 60 | Args: 61 | train_model_spec: (dict) contains the graph operations or nodes needed for training 62 | eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation 63 | model_dir: (string) directory containing config, weights and log 64 | params: (Params) contains hyperparameters of the model. 
65 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 66 | restore_from: (string) directory or file containing weights to restore the graph 67 | """ 68 | # Initialize tf.Saver instances to save weights during training 69 | last_saver = tf.train.Saver() # will keep last 5 epochs 70 | best_saver = tf.train.Saver(max_to_keep=1) # only keep 1 best checkpoint (best on eval) 71 | begin_at_epoch = 0 72 | 73 | with tf.Session() as sess: 74 | # Initialize model variables 75 | sess.run(train_model_spec['variable_init_op']) 76 | 77 | # Reload weights from directory if specified 78 | if restore_from is not None: 79 | logging.info("Restoring parameters from {}".format(restore_from)) 80 | if os.path.isdir(restore_from): 81 | restore_from = tf.train.latest_checkpoint(restore_from) 82 | begin_at_epoch = int(restore_from.split('-')[-1]) 83 | last_saver.restore(sess, restore_from) 84 | 85 | # For tensorboard (takes care of writing summaries to files) 86 | train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summaries'), sess.graph) 87 | eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summaries'), sess.graph) 88 | 89 | best_eval_acc = 0.0 90 | for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs): 91 | # Run one epoch 92 | logging.info("Epoch {}/{}".format(epoch + 1, begin_at_epoch + params.num_epochs)) 93 | # Compute number of batches in one epoch (one full pass over the training set) 94 | num_steps = (params.train_size + params.batch_size - 1) // params.batch_size 95 | train_sess(sess, train_model_spec, num_steps, train_writer, params) 96 | 97 | # Save weights 98 | last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch') 99 | last_saver.save(sess, last_save_path, global_step=epoch + 1) 100 | 101 | # Evaluate for one epoch on validation set 102 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 103 | metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer) 104 | 105 | # If best_eval, best_save_path 106 | eval_acc = metrics['accuracy'] 107 | if eval_acc >= best_eval_acc: 108 | # Store new best accuracy 109 | best_eval_acc = eval_acc 110 | # Save weights 111 | best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch') 112 | best_save_path = best_saver.save(sess, best_save_path, global_step=epoch + 1) 113 | logging.info("- Found new best accuracy, saving in {}".format(best_save_path)) 114 | # Save best eval metrics in a json file in the model directory 115 | best_json_path = os.path.join(model_dir, "metrics_eval_best_weights.json") 116 | save_dict_to_json(metrics, best_json_path) 117 | 118 | # Save latest eval metrics in a json file in the model directory 119 | last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json") 120 | save_dict_to_json(metrics, last_json_path) 121 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/utils.py: -------------------------------------------------------------------------------- 1 | """General utility functions""" 2 | 3 | import json 4 | import logging 5 | 6 | 7 | class Params(): 8 | """Class that loads hyperparameters from a json file. 
9 | 10 | Example: 11 | ``` 12 | params = Params(json_path) 13 | print(params.learning_rate) 14 | params.learning_rate = 0.5 # change the value of learning_rate in params 15 | ``` 16 | """ 17 | 18 | def __init__(self, json_path): 19 | self.update(json_path) 20 | 21 | def save(self, json_path): 22 | """Saves parameters to json file""" 23 | with open(json_path, 'w') as f: 24 | json.dump(self.__dict__, f, indent=4) 25 | 26 | def update(self, json_path): 27 | """Loads parameters from json file""" 28 | with open(json_path) as f: 29 | params = json.load(f) 30 | self.__dict__.update(params) 31 | 32 | @property 33 | def dict(self): 34 | """Gives dict-like access to Params instance by `params.dict['learning_rate']`""" 35 | return self.__dict__ 36 | 37 | 38 | def set_logger(log_path): 39 | """Sets the logger to log info in terminal and file `log_path`. 40 | 41 | In general, it is useful to have a logger so that every output to the terminal is saved 42 | in a permanent file. Here we save it to `model_dir/train.log`. 43 | 44 | Example: 45 | ``` 46 | logging.info("Starting training...") 47 | ``` 48 | 49 | Args: 50 | log_path: (string) where to log 51 | """ 52 | logger = logging.getLogger() 53 | logger.setLevel(logging.INFO) 54 | 55 | if not logger.handlers: 56 | # Logging to a file 57 | file_handler = logging.FileHandler(log_path) 58 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 59 | logger.addHandler(file_handler) 60 | 61 | # Logging to console 62 | stream_handler = logging.StreamHandler() 63 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 64 | logger.addHandler(stream_handler) 65 | 66 | 67 | def save_dict_to_json(d, json_path): 68 | """Saves dict of floats in json file 69 | 70 | Args: 71 | d: (dict) of float-castable values (np.float, int, float, etc.) 
72 | json_path: (string) path to json file 73 | """ 74 | with open(json_path, 'w') as f: 75 | # We need to convert the values to float for json (it doesn't accept np.array, np.float, ) 76 | d = {k: float(v) for k, v in d.items()} 77 | json.dump(d, f, indent=4) 78 | -------------------------------------------------------------------------------- /tensorflow/nlp/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.0 2 | tabulate 3 | tqdm 4 | -------------------------------------------------------------------------------- /tensorflow/nlp/search_hyperparams.py: -------------------------------------------------------------------------------- 1 | """Peform hyperparemeters search""" 2 | 3 | import argparse 4 | import os 5 | from subprocess import check_call 6 | import sys 7 | 8 | from model.utils import Params 9 | 10 | 11 | PYTHON = sys.executable 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--parent_dir', default='experiments/learning_rate', 14 | help="Directory containing params.json") 15 | parser.add_argument('--data_dir', default='data/small', 16 | help="Directory containing the dataset") 17 | 18 | 19 | def launch_training_job(parent_dir, data_dir, job_name, params): 20 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name 21 | 22 | Args: 23 | parent_dir: (string) directory containing config, weights and log 24 | data_dir: (string) directory containing the dataset 25 | params: (dict) containing hyperparameters 26 | """ 27 | # Create a new folder in parent_dir with unique_name "job_name" 28 | model_dir = os.path.join(parent_dir, job_name) 29 | if not os.path.exists(model_dir): 30 | os.makedirs(model_dir) 31 | 32 | # Write parameters in json file 33 | json_path = os.path.join(model_dir, 'params.json') 34 | params.save(json_path) 35 | 36 | # Launch training with this config 37 | cmd = "{python} train.py --model_dir {model_dir} --data_dir {data_dir}".format(python=PYTHON, 38 | model_dir=model_dir, data_dir=data_dir) 39 | print(cmd) 40 | check_call(cmd, shell=True) 41 | 42 | 43 | if __name__ == "__main__": 44 | # Load the "reference" parameters from parent_dir json file 45 | args = parser.parse_args() 46 | json_path = os.path.join(args.parent_dir, 'params.json') 47 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 48 | params = Params(json_path) 49 | 50 | # Perform hypersearch over one parameter 51 | learning_rates = [1e-4, 1e-3, 1e-2] 52 | 53 | for learning_rate in learning_rates: 54 | # Modify the relevant parameter in params 55 | params.learning_rate = learning_rate 56 | 57 | # Launch job (name has to be unique) 58 | job_name = "learning_rate_{}".format(learning_rate) 59 | launch_training_job(args.parent_dir, args.data_dir, job_name, params) 60 | -------------------------------------------------------------------------------- /tensorflow/nlp/synthesize_results.py: -------------------------------------------------------------------------------- 1 | """Aggregates results from the metrics_eval_best_weights.json in a parent folder""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in folder `parent_dir`. 
17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_dev.json` 20 | 21 | Args: 22 | parent_dir: (string) path to directory containing experiments results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_eval_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) 63 | -------------------------------------------------------------------------------- /tensorflow/nlp/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import tensorflow as tf 8 | 9 | from model.utils import Params 10 | from model.utils import set_logger 11 | from model.training import train_and_evaluate 12 | from model.input_fn import input_fn 13 | from model.input_fn import load_dataset_from_text 14 | from model.model_fn import model_fn 15 | 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--model_dir', default='experiments/base_model', 19 | help="Directory containing params.json") 20 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 21 | parser.add_argument('--restore_dir', default=None, 22 | help="Optional, directory containing weights to reload before training") 23 | 24 | 25 | if __name__ == '__main__': 26 | # Set the random seed for the whole graph for reproductible experiments 27 | tf.set_random_seed(230) 28 | 29 | # Load the parameters from the experiment params.json file in model_dir 30 | args = parser.parse_args() 31 | json_path = os.path.join(args.model_dir, 'params.json') 32 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 33 | params = Params(json_path) 34 | 35 | # Load the parameters from the dataset, that gives the size etc. 
into params 36 | json_path = os.path.join(args.data_dir, 'dataset_params.json') 37 | assert os.path.isfile(json_path), "No json file found at {}, run build_vocab.py".format(json_path) 38 | params.update(json_path) 39 | num_oov_buckets = params.num_oov_buckets # number of buckets for unknown words 40 | 41 | # Check that we are not overwriting some previous experiment 42 | # Comment these lines if you are developing your model and don't care about overwritting 43 | model_dir_has_best_weights = os.path.isdir(os.path.join(args.model_dir, "best_weights")) 44 | overwritting = model_dir_has_best_weights and args.restore_dir is None 45 | assert not overwritting, "Weights found in model_dir, aborting to avoid overwrite" 46 | 47 | # Set the logger 48 | set_logger(os.path.join(args.model_dir, 'train.log')) 49 | 50 | # Get paths for vocabularies and dataset 51 | path_words = os.path.join(args.data_dir, 'words.txt') 52 | path_tags = os.path.join(args.data_dir, 'tags.txt') 53 | path_train_sentences = os.path.join(args.data_dir, 'train/sentences.txt') 54 | path_train_labels = os.path.join(args.data_dir, 'train/labels.txt') 55 | path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt') 56 | path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt') 57 | 58 | # Load Vocabularies 59 | words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets) 60 | tags = tf.contrib.lookup.index_table_from_file(path_tags) 61 | 62 | # Create the input data pipeline 63 | logging.info("Creating the datasets...") 64 | train_sentences = load_dataset_from_text(path_train_sentences, words) 65 | train_labels = load_dataset_from_text(path_train_labels, tags) 66 | eval_sentences = load_dataset_from_text(path_eval_sentences, words) 67 | eval_labels = load_dataset_from_text(path_eval_labels, tags) 68 | 69 | # Specify other parameters for the dataset and the model 70 | params.eval_size = params.dev_size 71 | params.buffer_size = params.train_size # buffer size for shuffling 72 | params.id_pad_word = words.lookup(tf.constant(params.pad_word)) 73 | params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag)) 74 | 75 | # Create the two iterators over the two datasets 76 | train_inputs = input_fn('train', train_sentences, train_labels, params) 77 | eval_inputs = input_fn('eval', eval_sentences, eval_labels, params) 78 | logging.info("- done.") 79 | 80 | # Define the models (2 different set of nodes that share weights for train and eval) 81 | logging.info("Creating the model...") 82 | train_model_spec = model_fn('train', train_inputs, params) 83 | eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True) 84 | logging.info("- done.") 85 | 86 | # Train the model 87 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 88 | train_and_evaluate(train_model_spec, eval_model_spec, args.model_dir, params, args.restore_dir) -------------------------------------------------------------------------------- /tensorflow/vision/README.md: -------------------------------------------------------------------------------- 1 | # Hand Signs Recognition with Tensorflow 2 | 3 | _Authors: Olivier Moindrot and Guillaume Genthial_ 4 | 5 | Take the time to read the [tutorials](https://cs230-stanford.github.io). 6 | 7 | Note: all scripts must be run in folder `tensorflow/vision`. 8 | 9 | ## Requirements 10 | 11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html). 
12 | 13 | ``` 14 | virtualenv -p python3 .env 15 | source .env/bin/activate 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | When you're done working on the project, deactivate the virtual environment with `deactivate`. 20 | 21 | ## Task 22 | 23 | Given an image of a hand doing a sign representing 0, 1, 2, 3, 4 or 5, predict the correct label. 24 | 25 | ## Download the SIGNS dataset 26 | 27 | For the vision example, we will use the SIGNS dataset created for this class. The dataset is hosted on Google Drive; download it [here][signs]. 28 | 29 | This will download the SIGNS dataset (~1.1 GB) containing photos of hand signs representing the numbers 0 to 5. 30 | Here is the structure of the data: 31 | 32 | ``` 33 | SIGNS/ 34 | train_signs/ 35 | 0_IMG_5864.jpg 36 | ... 37 | test_signs/ 38 | 0_IMG_5942.jpg 39 | ... 40 | ``` 41 | 42 | The images are named following `{label}_IMG_{id}.jpg` where the label is in `[0, 5]`. 43 | The training set contains 1,080 images and the test set contains 120 images. 44 | 45 | Once the download is complete, move the dataset into `data/SIGNS`. 46 | Run the script `build_dataset.py`, which will resize the images to size `(64, 64)`. The new resized dataset will be located by default in `data/64x64_SIGNS`: 47 | 48 | ```bash 49 | python build_dataset.py --data_dir data/SIGNS --output_dir data/64x64_SIGNS 50 | ``` 51 | 52 | ## Quickstart (~10 min) 53 | 54 | 1. **Build the dataset of size 64x64**: make sure you complete this step before training 55 | 56 | ```bash 57 | python build_dataset.py --data_dir data/SIGNS\ dataset/ --output_dir data/64x64_SIGNS 58 | ``` 59 | 60 | 2. **Your first experiment** We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the parameters for the experiment. It looks like 61 | 62 | ```json 63 | { 64 | "learning_rate": 1e-3, 65 | "batch_size": 32, 66 | "num_epochs": 10, 67 | ... 68 | } 69 | ``` 70 | 71 | For every new experiment, you will need to create a new directory under `experiments` with a similar `params.json` file. 72 | 73 | 3. **Train** your experiment. Simply run 74 | 75 | ``` 76 | python train.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model 77 | ``` 78 | 79 | It will instantiate a model and train it on the training set following the parameters specified in `params.json`. It will also evaluate some metrics on the development set. 80 | 81 | 4. **Your first hyperparameter search** We created a new directory `learning_rate` in `experiments` for you. Now, run 82 | 83 | ``` 84 | python search_hyperparams.py --data_dir data/64x64_SIGNS --parent_dir experiments/learning_rate 85 | ``` 86 | 87 | It will train and evaluate a model with the different values of the learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`. 88 | 89 | 5. **Display the results** of the hyperparameter search in a nice format 90 | 91 | ``` 92 | python synthesize_results.py --parent_dir experiments/learning_rate 93 | ``` 94 | 95 | 6. **Evaluation on the test set** Once you've run many experiments and selected your best model and hyperparameters based on their performance on the development set, you can finally evaluate the performance of your model on the test set.
Run 96 | 97 | ``` 98 | python evaluate.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model 99 | ``` 100 | 101 | ## Guidelines for more advanced use 102 | 103 | We recommend reading through `train.py` to get a high-level overview of the steps: 104 | 105 | - loading the hyperparameters for the experiment (the `params.json`) 106 | - getting the filenames / labels 107 | - creating the input of our model by zipping the filenames and labels together (`input_fn(...)`), reading the images as well as performing batching and shuffling. 108 | - creating the model (=nodes / ops of the `tf.Graph()`) by calling `model_fn(...)` 109 | - training the model for a given number of epochs by calling `train_and_evaluate(...)` 110 | 111 | Once you get the high-level idea, depending on your dataset, you might want to modify 112 | 113 | - `model/model_fn.py` to change the model 114 | - `model/input_fn.py` to change the way you read data 115 | - `train.py` and `evaluate.py` if somes changes in the model or input require changes here 116 | 117 | If you want to compute new metrics for which you can find a [tensorflow implementation](https://www.tensorflow.org/api_docs/python/tf/metrics), you can define it in the `model_fn.py` (add it to the `metrics` dictionnary). It will automatically be updated during the training and will be displayed at the end of each epoch. 118 | 119 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs. 120 | 121 | ## Resources 122 | 123 | Note that this repository uses Tensorflow 1.14. Tensorflow 2 has just been 124 | released, so the links below now point to Tensorflow 2 documentation. You can 125 | navigate to the old 1.14 docs through the API dropdown on navigation bar. 126 | 127 | There are major changes between TF 1 and TF 2, most notably Eager Execution 128 | being the new default mode. If your team is starting with a new project, we 129 | recommend using Tensorflow 2. 130 | 131 | Introduction to the `tf.data` pipeline 132 | 133 | - [programmer's guide](https://www.tensorflow.org/programmers_guide/datasets) 134 | - [reading images](https://www.tensorflow.org/programmers_guide/datasets#decoding_image_data_and_resizing_it) 135 | 136 | [signs]: https://drive.google.com/file/d/1ufiR6hUKhXoAyiBNsySPkUwlvE_wfEHC/view?usp=sharing 137 | -------------------------------------------------------------------------------- /tensorflow/vision/build_dataset.py: -------------------------------------------------------------------------------- 1 | """Split the SIGNS dataset into train/dev/test and resize images to 64x64. 2 | 3 | The SIGNS dataset comes in the following format: 4 | train_signs/ 5 | 0_IMG_5864.jpg 6 | ... 7 | test_signs/ 8 | 0_IMG_5942.jpg 9 | ... 10 | 11 | Original images have size (3024, 3024). 12 | Resizing to (64, 64) reduces the dataset size from 1.16 GB to 4.7 MB, and loading smaller images 13 | makes training faster. 14 | 15 | We already have a test set created, so we only need to split "train_signs" into train and dev sets. 16 | Because we don't have a lot of images and we want that the statistics on the dev set be as 17 | representative as possible, we'll take 20% of "train_signs" as dev set. 
18 | """ 19 | 20 | import argparse 21 | import random 22 | import os 23 | 24 | from PIL import Image 25 | from tqdm import tqdm 26 | 27 | 28 | SIZE = 64 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--data_dir', default='data/SIGNS', help="Directory with the SIGNS dataset") 32 | parser.add_argument('--output_dir', default='data/64x64_SIGNS', help="Where to write the new data") 33 | 34 | 35 | def resize_and_save(filename, output_dir, size=SIZE): 36 | """Resize the image contained in `filename` and save it to the `output_dir`""" 37 | image = Image.open(filename) 38 | # Use bilinear interpolation instead of the default "nearest neighbor" method 39 | image = image.resize((size, size), Image.BILINEAR) 40 | image.save(os.path.join(output_dir, filename.split('/')[-1])) 41 | 42 | 43 | if __name__ == '__main__': 44 | args = parser.parse_args() 45 | 46 | assert os.path.isdir(args.data_dir), "Couldn't find the dataset at {}".format(args.data_dir) 47 | 48 | # Define the data directories 49 | train_data_dir = os.path.join(args.data_dir, 'train_signs') 50 | test_data_dir = os.path.join(args.data_dir, 'test_signs') 51 | 52 | # Get the filenames in each directory (train and test) 53 | filenames = os.listdir(train_data_dir) 54 | filenames = [os.path.join(train_data_dir, f) for f in filenames if f.endswith('.jpg')] 55 | 56 | test_filenames = os.listdir(test_data_dir) 57 | test_filenames = [os.path.join(test_data_dir, f) for f in test_filenames if f.endswith('.jpg')] 58 | 59 | # Split the images in 'train_signs' into 80% train and 20% dev 60 | # Make sure to always shuffle with a fixed seed so that the split is reproducible 61 | random.seed(230) 62 | filenames.sort() 63 | random.shuffle(filenames) 64 | 65 | split = int(0.8 * len(filenames)) 66 | train_filenames = filenames[:split] 67 | dev_filenames = filenames[split:] 68 | 69 | filenames = {'train': train_filenames, 70 | 'dev': dev_filenames, 71 | 'test': test_filenames} 72 | 73 | if not os.path.exists(args.output_dir): 74 | os.mkdir(args.output_dir) 75 | else: 76 | print("Warning: output dir {} already exists".format(args.output_dir)) 77 | 78 | # Preprocess train, dev and test 79 | for split in ['train', 'dev', 'test']: 80 | output_dir_split = os.path.join(args.output_dir, '{}_signs'.format(split)) 81 | if not os.path.exists(output_dir_split): 82 | os.mkdir(output_dir_split) 83 | else: 84 | print("Warning: dir {} already exists".format(output_dir_split)) 85 | 86 | print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split)) 87 | for filename in tqdm(filenames[split]): 88 | resize_and_save(filename, output_dir_split, size=SIZE) 89 | 90 | print("Done building dataset") 91 | -------------------------------------------------------------------------------- /tensorflow/vision/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/vision/data/.gitkeep -------------------------------------------------------------------------------- /tensorflow/vision/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluate the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import tensorflow as tf 8 | 9 | from model.input_fn import input_fn 10 | from model.model_fn import model_fn 11 | from model.evaluation import evaluate 12 | from model.utils import Params 13 | from model.utils import 
set_logger 14 | 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--model_dir', default='experiments/test', 18 | help="Experiment directory containing params.json") 19 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 20 | help="Directory containing the dataset") 21 | parser.add_argument('--restore_from', default='best_weights', 22 | help="Subdirectory of model dir or file containing the weights") 23 | 24 | 25 | if __name__ == '__main__': 26 | # Set the random seed for the whole graph 27 | tf.set_random_seed(230) 28 | 29 | # Load the parameters 30 | args = parser.parse_args() 31 | json_path = os.path.join(args.model_dir, 'params.json') 32 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 33 | params = Params(json_path) 34 | 35 | # Set the logger 36 | set_logger(os.path.join(args.model_dir, 'evaluate.log')) 37 | 38 | # Create the input data pipeline 39 | logging.info("Creating the dataset...") 40 | data_dir = args.data_dir 41 | test_data_dir = os.path.join(data_dir, "test_signs") 42 | 43 | # Get the filenames from the test set 44 | test_filenames = os.listdir(test_data_dir) 45 | test_filenames = [os.path.join(test_data_dir, f) for f in test_filenames if f.endswith('.jpg')] 46 | 47 | test_labels = [int(f.split('/')[-1][0]) for f in test_filenames] 48 | 49 | # specify the size of the evaluation set 50 | params.eval_size = len(test_filenames) 51 | 52 | # create the iterator over the dataset 53 | test_inputs = input_fn(False, test_filenames, test_labels, params) 54 | 55 | # Define the model 56 | logging.info("Creating the model...") 57 | model_spec = model_fn('eval', test_inputs, params, reuse=False) 58 | 59 | logging.info("Starting evaluation") 60 | evaluate(model_spec, args.model_dir, params, args.restore_from) 61 | -------------------------------------------------------------------------------- /tensorflow/vision/experiments/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/vision/experiments/.gitkeep -------------------------------------------------------------------------------- /tensorflow/vision/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | 6 | "num_channels": 16, 7 | "use_batch_norm": true, 8 | "bn_momentum": 0.9, 9 | 10 | "image_size": 64, 11 | "use_random_flip": true, 12 | "num_labels": 6, 13 | 14 | "num_parallel_calls": 4, 15 | "save_summary_steps": 1 16 | } 17 | -------------------------------------------------------------------------------- /tensorflow/vision/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | 6 | "num_channels": 16, 7 | "use_batch_norm": true, 8 | "bn_momentum": 0.9, 9 | 10 | "image_size": 64, 11 | "use_random_flip": true, 12 | "num_labels": 6, 13 | 14 | "num_parallel_calls": 4, 15 | "save_summary_steps": 1 16 | } 17 | -------------------------------------------------------------------------------- /tensorflow/vision/model/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/vision/model/__init__.py -------------------------------------------------------------------------------- /tensorflow/vision/model/evaluation.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for evaluation""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | 11 | 12 | def evaluate_sess(sess, model_spec, num_steps, writer=None, params=None): 13 | """Train the model on `num_steps` batches. 14 | 15 | Args: 16 | sess: (tf.Session) current session 17 | model_spec: (dict) contains the graph operations or nodes needed for training 18 | num_steps: (int) train for this number of batches 19 | writer: (tf.summary.FileWriter) writer for summaries. Is None if we don't log anything 20 | params: (Params) hyperparameters 21 | """ 22 | update_metrics = model_spec['update_metrics'] 23 | eval_metrics = model_spec['metrics'] 24 | global_step = tf.train.get_global_step() 25 | 26 | # Load the evaluation dataset into the pipeline and initialize the metrics init op 27 | sess.run(model_spec['iterator_init_op']) 28 | sess.run(model_spec['metrics_init_op']) 29 | 30 | # compute metrics over the dataset 31 | for _ in range(num_steps): 32 | sess.run(update_metrics) 33 | 34 | # Get the values of the metrics 35 | metrics_values = {k: v[0] for k, v in eval_metrics.items()} 36 | metrics_val = sess.run(metrics_values) 37 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 38 | logging.info("- Eval metrics: " + metrics_string) 39 | 40 | # Add summaries manually to writer at global_step_val 41 | if writer is not None: 42 | global_step_val = sess.run(global_step) 43 | for tag, val in metrics_val.items(): 44 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) 45 | writer.add_summary(summ, global_step_val) 46 | 47 | return metrics_val 48 | 49 | 50 | def evaluate(model_spec, model_dir, params, restore_from): 51 | """Evaluate the model 52 | 53 | Args: 54 | model_spec: (dict) contains the graph operations or nodes needed for evaluation 55 | model_dir: (string) directory containing config, weights and log 56 | params: (Params) contains hyperparameters of the model. 
57 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 58 | restore_from: (string) directory or file containing weights to restore the graph 59 | """ 60 | # Initialize tf.Saver 61 | saver = tf.train.Saver() 62 | 63 | with tf.Session() as sess: 64 | # Initialize the model variables 65 | sess.run(model_spec['variable_init_op']) 66 | 67 | # Reload weights from the weights subdirectory 68 | save_path = os.path.join(model_dir, restore_from) 69 | if os.path.isdir(save_path): 70 | save_path = tf.train.latest_checkpoint(save_path) 71 | saver.restore(sess, save_path) 72 | 73 | # Evaluate 74 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 75 | metrics = evaluate_sess(sess, model_spec, num_steps) 76 | metrics_name = '_'.join(restore_from.split('/')) 77 | save_path = os.path.join(model_dir, "metrics_test_{}.json".format(metrics_name)) 78 | save_dict_to_json(metrics, save_path) 79 | -------------------------------------------------------------------------------- /tensorflow/vision/model/input_fn.py: -------------------------------------------------------------------------------- 1 | """Create the input data pipeline using `tf.data`""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def _parse_function(filename, label, size): 7 | """Obtain the image from the filename (for both training and validation). 8 | 9 | The following operations are applied: 10 | - Decode the image from jpeg format 11 | - Convert to float and to range [0, 1] 12 | """ 13 | image_string = tf.read_file(filename) 14 | 15 | # Don't use tf.image.decode_image, or the output shape will be undefined 16 | image_decoded = tf.image.decode_jpeg(image_string, channels=3) 17 | 18 | # This will convert to float values in [0, 1] 19 | image = tf.image.convert_image_dtype(image_decoded, tf.float32) 20 | 21 | resized_image = tf.image.resize_images(image, [size, size]) 22 | 23 | return resized_image, label 24 | 25 | 26 | def train_preprocess(image, label, use_random_flip): 27 | """Image preprocessing for training. 28 | 29 | Apply the following operations: 30 | - Horizontally flip the image with probability 1/2 31 | - Apply random brightness and saturation 32 | """ 33 | if use_random_flip: 34 | image = tf.image.random_flip_left_right(image) 35 | 36 | image = tf.image.random_brightness(image, max_delta=32.0 / 255.0) 37 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 38 | 39 | # Make sure the image is still in [0, 1] 40 | image = tf.clip_by_value(image, 0.0, 1.0) 41 | 42 | return image, label 43 | 44 | 45 | def input_fn(is_training, filenames, labels, params): 46 | """Input function for the SIGNS dataset. 47 | 48 | The filenames have format "{label}_IMG_{id}.jpg". 49 | For instance: "data_dir/2_IMG_4584.jpg". 50 | 51 | Args: 52 | is_training: (bool) whether to use the train or test pipeline. 53 | At training, we shuffle the data and apply random preprocessing 54 | filenames: (list) filenames of the images, as ["data_dir/{label}_IMG_{id}.jpg"...]
55 | labels: (list) corresponding list of labels 56 | params: (Params) contains hyperparameters of the model (ex: `params.num_epochs`) 57 | """ 58 | num_samples = len(filenames) 59 | assert len(filenames) == len(labels), "Filenames and labels should have same length" 60 | 61 | # Create a Dataset serving batches of images and labels 62 | # We don't repeat for multiple epochs because we always train and evaluate for one epoch 63 | parse_fn = lambda f, l: _parse_function(f, l, params.image_size) 64 | train_fn = lambda f, l: train_preprocess(f, l, params.use_random_flip) 65 | 66 | if is_training: 67 | dataset = (tf.data.Dataset.from_tensor_slices((tf.constant(filenames), tf.constant(labels))) 68 | .shuffle(num_samples) # whole dataset into the buffer ensures good shuffling 69 | .map(parse_fn, num_parallel_calls=params.num_parallel_calls) 70 | .map(train_fn, num_parallel_calls=params.num_parallel_calls) 71 | .batch(params.batch_size) 72 | .prefetch(1) # make sure you always have one batch ready to serve 73 | ) 74 | else: 75 | dataset = (tf.data.Dataset.from_tensor_slices((tf.constant(filenames), tf.constant(labels))) 76 | .map(parse_fn) 77 | .batch(params.batch_size) 78 | .prefetch(1) # make sure you always have one batch ready to serve 79 | ) 80 | 81 | # Create reinitializable iterator from dataset 82 | iterator = dataset.make_initializable_iterator() 83 | images, labels = iterator.get_next() 84 | iterator_init_op = iterator.initializer 85 | 86 | inputs = {'images': images, 'labels': labels, 'iterator_init_op': iterator_init_op} 87 | return inputs 88 | -------------------------------------------------------------------------------- /tensorflow/vision/model/model_fn.py: -------------------------------------------------------------------------------- 1 | """Define the model.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def build_model(is_training, inputs, params): 7 | """Compute logits of the model (output distribution) 8 | 9 | Args: 10 | is_training: (bool) whether we are training or not 11 | inputs: (dict) contains the inputs of the graph (features, labels...) 
12 | this can be `tf.placeholder` or outputs of `tf.data` 13 | params: (Params) hyperparameters 14 | 15 | Returns: 16 | output: (tf.Tensor) output of the model 17 | """ 18 | images = inputs['images'] 19 | 20 | assert images.get_shape().as_list() == [None, params.image_size, params.image_size, 3] 21 | 22 | out = images 23 | # Define the number of channels of each convolution 24 | # For each block, we do: 3x3 conv -> batch norm -> relu -> 2x2 maxpool 25 | num_channels = params.num_channels 26 | bn_momentum = params.bn_momentum 27 | channels = [num_channels, num_channels * 2, num_channels * 4, num_channels * 8] 28 | for i, c in enumerate(channels): 29 | with tf.variable_scope('block_{}'.format(i+1)): 30 | out = tf.layers.conv2d(out, c, 3, padding='same') 31 | if params.use_batch_norm: 32 | out = tf.layers.batch_normalization(out, momentum=bn_momentum, training=is_training) 33 | out = tf.nn.relu(out) 34 | out = tf.layers.max_pooling2d(out, 2, 2) 35 | 36 | assert out.get_shape().as_list() == [None, 4, 4, num_channels * 8] 37 | 38 | out = tf.reshape(out, [-1, 4 * 4 * num_channels * 8]) 39 | with tf.variable_scope('fc_1'): 40 | out = tf.layers.dense(out, num_channels * 8) 41 | if params.use_batch_norm: 42 | out = tf.layers.batch_normalization(out, momentum=bn_momentum, training=is_training) 43 | out = tf.nn.relu(out) 44 | with tf.variable_scope('fc_2'): 45 | logits = tf.layers.dense(out, params.num_labels) 46 | 47 | return logits 48 | 49 | 50 | def model_fn(mode, inputs, params, reuse=False): 51 | """Model function defining the graph operations. 52 | 53 | Args: 54 | mode: (string) can be 'train' or 'eval' 55 | inputs: (dict) contains the inputs of the graph (features, labels...) 56 | this can be `tf.placeholder` or outputs of `tf.data` 57 | params: (Params) contains hyperparameters of the model (ex: `params.learning_rate`) 58 | reuse: (bool) whether to reuse the weights 59 | 60 | Returns: 61 | model_spec: (dict) contains the graph operations or nodes needed for training / evaluation 62 | """ 63 | is_training = (mode == 'train') 64 | labels = inputs['labels'] 65 | labels = tf.cast(labels, tf.int64) 66 | 67 | # ----------------------------------------------------------- 68 | # MODEL: define the layers of the model 69 | with tf.variable_scope('model', reuse=reuse): 70 | # Compute the output distribution of the model and the predictions 71 | logits = build_model(is_training, inputs, params) 72 | predictions = tf.argmax(logits, 1) 73 | 74 | # Define loss and accuracy 75 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 76 | accuracy = tf.reduce_mean(tf.cast(tf.equal(labels, predictions), tf.float32)) 77 | 78 | # Define training step that minimizes the loss with the Adam optimizer 79 | if is_training: 80 | optimizer = tf.train.AdamOptimizer(params.learning_rate) 81 | global_step = tf.train.get_or_create_global_step() 82 | if params.use_batch_norm: 83 | # Add a dependency to update the moving mean and variance for batch normalization 84 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 85 | train_op = optimizer.minimize(loss, global_step=global_step) 86 | else: 87 | train_op = optimizer.minimize(loss, global_step=global_step) 88 | 89 | 90 | # ----------------------------------------------------------- 91 | # METRICS AND SUMMARIES 92 | # Metrics for evaluation using tf.metrics (average over whole dataset) 93 | with tf.variable_scope("metrics"): 94 | metrics = { 95 | 'accuracy': tf.metrics.accuracy(labels=labels, predictions=tf.argmax(logits, 1)), 96 | 
'loss': tf.metrics.mean(loss) 97 | } 98 | 99 | # Group the update ops for the tf.metrics 100 | update_metrics_op = tf.group(*[op for _, op in metrics.values()]) 101 | 102 | # Get the op to reset the local variables used in tf.metrics 103 | metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 104 | metrics_init_op = tf.variables_initializer(metric_variables) 105 | 106 | # Summaries for training 107 | tf.summary.scalar('loss', loss) 108 | tf.summary.scalar('accuracy', accuracy) 109 | tf.summary.image('train_image', inputs['images']) 110 | 111 | #TODO: if mode == 'eval': ? 112 | # Add incorrectly labeled images 113 | mask = tf.not_equal(labels, predictions) 114 | 115 | # Add a different summary to know how they were misclassified 116 | for label in range(0, params.num_labels): 117 | mask_label = tf.logical_and(mask, tf.equal(predictions, label)) 118 | incorrect_image_label = tf.boolean_mask(inputs['images'], mask_label) 119 | tf.summary.image('incorrectly_labeled_{}'.format(label), incorrect_image_label) 120 | 121 | # ----------------------------------------------------------- 122 | # MODEL SPECIFICATION 123 | # Create the model specification and return it 124 | # It contains nodes or operations in the graph that will be used for training and evaluation 125 | model_spec = inputs 126 | model_spec['variable_init_op'] = tf.global_variables_initializer() 127 | model_spec["predictions"] = predictions 128 | model_spec['loss'] = loss 129 | model_spec['accuracy'] = accuracy 130 | model_spec['metrics_init_op'] = metrics_init_op 131 | model_spec['metrics'] = metrics 132 | model_spec['update_metrics'] = update_metrics_op 133 | model_spec['summary_op'] = tf.summary.merge_all() 134 | 135 | if is_training: 136 | model_spec['train_op'] = train_op 137 | 138 | return model_spec 139 | -------------------------------------------------------------------------------- /tensorflow/vision/model/training.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for training""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | from model.evaluation import evaluate_sess 11 | 12 | 13 | def train_sess(sess, model_spec, num_steps, writer, params): 14 | """Train the model on `num_steps` batches 15 | 16 | Args: 17 | sess: (tf.Session) current session 18 | model_spec: (dict) contains the graph operations or nodes needed for training 19 | num_steps: (int) train for this number of batches 20 | writer: (tf.summary.FileWriter) writer for summaries 21 | params: (Params) hyperparameters 22 | """ 23 | # Get relevant graph operations or nodes needed for training 24 | loss = model_spec['loss'] 25 | train_op = model_spec['train_op'] 26 | update_metrics = model_spec['update_metrics'] 27 | metrics = model_spec['metrics'] 28 | summary_op = model_spec['summary_op'] 29 | global_step = tf.train.get_global_step() 30 | 31 | # Load the training dataset into the pipeline and initialize the metrics local variables 32 | sess.run(model_spec['iterator_init_op']) 33 | sess.run(model_spec['metrics_init_op']) 34 | 35 | # Use tqdm for progress bar 36 | t = trange(num_steps) 37 | for i in t: 38 | # Evaluate summaries for tensorboard only once in a while 39 | if i % params.save_summary_steps == 0: 40 | # Perform a mini-batch update 41 | _, _, loss_val, summ, global_step_val = sess.run([train_op, update_metrics, loss, 42 | summary_op, global_step]) 43 | # Write 
summaries for tensorboard 44 | writer.add_summary(summ, global_step_val) 45 | else: 46 | _, _, loss_val = sess.run([train_op, update_metrics, loss]) 47 | # Log the loss in the tqdm progress bar 48 | t.set_postfix(loss='{:05.3f}'.format(loss_val)) 49 | 50 | 51 | metrics_values = {k: v[0] for k, v in metrics.items()} 52 | metrics_val = sess.run(metrics_values) 53 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 54 | logging.info("- Train metrics: " + metrics_string) 55 | 56 | 57 | def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, restore_from=None): 58 | """Train the model and evaluate every epoch. 59 | 60 | Args: 61 | train_model_spec: (dict) contains the graph operations or nodes needed for training 62 | eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation 63 | model_dir: (string) directory containing config, weights and log 64 | params: (Params) contains hyperparameters of the model. 65 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 66 | restore_from: (string) directory or file containing weights to restore the graph 67 | """ 68 | # Initialize tf.Saver instances to save weights during training 69 | last_saver = tf.train.Saver() # will keep last 5 epochs 70 | best_saver = tf.train.Saver(max_to_keep=1) # only keep 1 best checkpoint (best on eval) 71 | begin_at_epoch = 0 72 | 73 | with tf.Session() as sess: 74 | # Initialize model variables 75 | sess.run(train_model_spec['variable_init_op']) 76 | 77 | # Reload weights from directory if specified 78 | if restore_from is not None: 79 | logging.info("Restoring parameters from {}".format(restore_from)) 80 | if os.path.isdir(restore_from): 81 | restore_from = tf.train.latest_checkpoint(restore_from) 82 | begin_at_epoch = int(restore_from.split('-')[-1]) 83 | last_saver.restore(sess, restore_from) 84 | 85 | # For tensorboard (takes care of writing summaries to files) 86 | train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summaries'), sess.graph) 87 | eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summaries'), sess.graph) 88 | 89 | best_eval_acc = 0.0 90 | for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs): 91 | # Run one epoch 92 | logging.info("Epoch {}/{}".format(epoch + 1, begin_at_epoch + params.num_epochs)) 93 | # Compute number of batches in one epoch (one full pass over the training set) 94 | num_steps = (params.train_size + params.batch_size - 1) // params.batch_size 95 | train_sess(sess, train_model_spec, num_steps, train_writer, params) 96 | 97 | # Save weights 98 | last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch') 99 | last_saver.save(sess, last_save_path, global_step=epoch + 1) 100 | 101 | # Evaluate for one epoch on validation set 102 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 103 | metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer) 104 | 105 | # If best_eval, best_save_path 106 | eval_acc = metrics['accuracy'] 107 | if eval_acc >= best_eval_acc: 108 | # Store new best accuracy 109 | best_eval_acc = eval_acc 110 | # Save weights 111 | best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch') 112 | best_save_path = best_saver.save(sess, best_save_path, global_step=epoch + 1) 113 | logging.info("- Found new best accuracy, saving in {}".format(best_save_path)) 114 | # Save best eval metrics in a json file in the model directory 115 | best_json_path = 
os.path.join(model_dir, "metrics_eval_best_weights.json") 116 | save_dict_to_json(metrics, best_json_path) 117 | 118 | # Save latest eval metrics in a json file in the model directory 119 | last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json") 120 | save_dict_to_json(metrics, last_json_path) 121 | -------------------------------------------------------------------------------- /tensorflow/vision/model/utils.py: -------------------------------------------------------------------------------- 1 | """General utility functions""" 2 | 3 | import json 4 | import logging 5 | 6 | 7 | class Params(): 8 | """Class that loads hyperparameters from a json file. 9 | 10 | Example: 11 | ``` 12 | params = Params(json_path) 13 | print(params.learning_rate) 14 | params.learning_rate = 0.5 # change the value of learning_rate in params 15 | ``` 16 | """ 17 | 18 | def __init__(self, json_path): 19 | self.update(json_path) 20 | 21 | def save(self, json_path): 22 | """Saves parameters to json file""" 23 | with open(json_path, 'w') as f: 24 | json.dump(self.__dict__, f, indent=4) 25 | 26 | def update(self, json_path): 27 | """Loads parameters from json file""" 28 | with open(json_path) as f: 29 | params = json.load(f) 30 | self.__dict__.update(params) 31 | 32 | @property 33 | def dict(self): 34 | """Gives dict-like access to Params instance by `params.dict['learning_rate']`""" 35 | return self.__dict__ 36 | 37 | 38 | def set_logger(log_path): 39 | """Sets the logger to log info in terminal and file `log_path`. 40 | 41 | In general, it is useful to have a logger so that every output to the terminal is saved 42 | in a permanent file. Here we save it to `model_dir/train.log`. 43 | 44 | Example: 45 | ``` 46 | logging.info("Starting training...") 47 | ``` 48 | 49 | Args: 50 | log_path: (string) where to log 51 | """ 52 | logger = logging.getLogger() 53 | logger.setLevel(logging.INFO) 54 | 55 | if not logger.handlers: 56 | # Logging to a file 57 | file_handler = logging.FileHandler(log_path) 58 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 59 | logger.addHandler(file_handler) 60 | 61 | # Logging to console 62 | stream_handler = logging.StreamHandler() 63 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 64 | logger.addHandler(stream_handler) 65 | 66 | 67 | def save_dict_to_json(d, json_path): 68 | """Saves dict of floats in json file 69 | 70 | Args: 71 | d: (dict) of float-castable values (np.float, int, float, etc.) 
72 | json_path: (string) path to json file 73 | """ 74 | with open(json_path, 'w') as f: 75 | # We need to convert the values to float for json (it doesn't accept np.array or np.float) 76 | d = {k: float(v) for k, v in d.items()} 77 | json.dump(d, f, indent=4) 78 | -------------------------------------------------------------------------------- /tensorflow/vision/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | Pillow 3 | tensorflow==1.15.0 4 | tabulate 5 | tqdm 6 | -------------------------------------------------------------------------------- /tensorflow/vision/search_hyperparams.py: -------------------------------------------------------------------------------- 1 | """Perform hyperparameter search""" 2 | 3 | import argparse 4 | import os 5 | from subprocess import check_call 6 | import sys 7 | 8 | from model.utils import Params 9 | 10 | 11 | PYTHON = sys.executable 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--parent_dir', default='experiments/learning_rate', 14 | help="Directory containing params.json") 15 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 16 | help="Directory containing the dataset") 17 | 18 | 19 | def launch_training_job(parent_dir, data_dir, job_name, params): 20 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name 21 | 22 | Args: 23 | parent_dir: (string) directory containing config, weights and log 24 | data_dir: (string) directory containing the dataset 25 | params: (Params) hyperparameters of the job 26 | """ 27 | # Create a new folder in parent_dir with the unique name "job_name" 28 | model_dir = os.path.join(parent_dir, job_name) 29 | if not os.path.exists(model_dir): 30 | os.makedirs(model_dir) 31 | 32 | # Write parameters in json file 33 | json_path = os.path.join(model_dir, 'params.json') 34 | params.save(json_path) 35 | 36 | # Launch training with this config 37 | cmd = "{python} train.py --model_dir {model_dir} --data_dir {data_dir}".format(python=PYTHON, 38 | model_dir=model_dir, data_dir=data_dir) 39 | print(cmd) 40 | check_call(cmd, shell=True) 41 | 42 | 43 | if __name__ == "__main__": 44 | # Load the "reference" parameters from parent_dir json file 45 | args = parser.parse_args() 46 | json_path = os.path.join(args.parent_dir, 'params.json') 47 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 48 | params = Params(json_path) 49 | 50 | # Perform hyperparameter search over one parameter 51 | learning_rates = [1e-4, 1e-3, 1e-2] 52 | 53 | for learning_rate in learning_rates: 54 | # Modify the relevant parameter in params 55 | params.learning_rate = learning_rate 56 | 57 | # Launch job (name has to be unique) 58 | job_name = "learning_rate_{}".format(learning_rate) 59 | launch_training_job(args.parent_dir, args.data_dir, job_name, params) 60 | -------------------------------------------------------------------------------- /tensorflow/vision/synthesize_results.py: -------------------------------------------------------------------------------- 1 | """Aggregates results from the metrics_eval_best_weights.json in a parent folder""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in
folder `parent_dir`. 17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_eval_best_weights.json` 20 | 21 | Args: 22 | parent_dir: (string) path to the directory containing the experiments' results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_eval_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) 63 | -------------------------------------------------------------------------------- /tensorflow/vision/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import random 7 | 8 | import tensorflow as tf 9 | 10 | from model.input_fn import input_fn 11 | from model.utils import Params 12 | from model.utils import set_logger 13 | from model.utils import save_dict_to_json 14 | from model.model_fn import model_fn 15 | from model.training import train_and_evaluate 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--model_dir', default='experiments/test', 20 | help="Experiment directory containing params.json") 21 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 22 | help="Directory containing the dataset") 23 | parser.add_argument('--restore_from', default=None, 24 | help="Optional, directory or file containing weights to reload before training") 25 | 26 | 27 | if __name__ == '__main__': 28 | # Set the random seed for the whole graph for reproducible experiments 29 | tf.set_random_seed(230) 30 | 31 | # Load the parameters from json file 32 | args = parser.parse_args() 33 | json_path = os.path.join(args.model_dir, 'params.json') 34 | assert os.path.isfile( 35 | json_path), "No json configuration file found at {}".format(json_path) 36 | params = Params(json_path) 37 | 38 | # Check that we are not overwriting some previous experiment 39 | # Comment these lines if you are developing your model and don't care about overwriting 40 | model_dir_has_best_weights = os.path.isdir( 41 | os.path.join(args.model_dir, "best_weights")) 42 | overwriting = model_dir_has_best_weights and args.restore_from is None 43 | assert not overwriting, "Weights found in model_dir, aborting to avoid overwrite" 44 | 45 | # Set the
logger 46 | set_logger(os.path.join(args.model_dir, 'train.log')) 47 | 48 | # Create the input data pipeline 49 | logging.info("Creating the datasets...") 50 | data_dir = args.data_dir 51 | train_data_dir = os.path.join(data_dir, "train_signs") 52 | dev_data_dir = os.path.join(data_dir, "dev_signs") 53 | 54 | # Get the filenames from the train and dev sets 55 | train_filenames = [os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir) 56 | if f.endswith('.jpg')] 57 | eval_filenames = [os.path.join(dev_data_dir, f) for f in os.listdir(dev_data_dir) 58 | if f.endswith('.jpg')] 59 | 60 | # Labels will be between 0 and 5 included (6 classes in total) 61 | train_labels = [int(f.split('/')[-1][0]) for f in train_filenames] 62 | eval_labels = [int(f.split('/')[-1][0]) for f in eval_filenames] 63 | 64 | # Specify the sizes of the dataset we train on and evaluate on 65 | params.train_size = len(train_filenames) 66 | params.eval_size = len(eval_filenames) 67 | 68 | # Create the two iterators over the two datasets 69 | train_inputs = input_fn(True, train_filenames, train_labels, params) 70 | eval_inputs = input_fn(False, eval_filenames, eval_labels, params) 71 | 72 | # Define the model 73 | logging.info("Creating the model...") 74 | train_model_spec = model_fn('train', train_inputs, params) 75 | eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True) 76 | 77 | # Train the model 78 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 79 | train_and_evaluate(train_model_spec, eval_model_spec, 80 | args.model_dir, params, args.restore_from) 81 | --------------------------------------------------------------------------------
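
Taken together, these scripts form a small experiment workflow: `train.py` trains on `train_signs`, evaluates on `dev_signs` after every epoch, and keeps the best checkpoint in `model_dir/best_weights`; `search_hyperparams.py` launches one training job per hyperparameter value; `synthesize_results.py` aggregates every `metrics_eval_best_weights.json` into a table; and `evaluate.py` scores a chosen checkpoint on `test_signs`. The sketch below is a hypothetical driver (not part of the repository) showing one way to chain the scripts from Python with `subprocess`; the directory names are simply the defaults used above and may need to be adapted.

```python
# Hypothetical end-to-end driver for tensorflow/vision (illustrative only).
# Assumes the SIGNS dataset has been built under data/64x64_SIGNS and that
# experiments/base_model/params.json exists.
import sys
from subprocess import check_call

PYTHON = sys.executable
MODEL_DIR = "experiments/base_model"   # experiment directory containing params.json
DATA_DIR = "data/64x64_SIGNS"          # contains train_signs/, dev_signs/ and test_signs/

# 1. Train, evaluating on dev_signs every epoch; the best checkpoint is saved
#    under MODEL_DIR/best_weights by train_and_evaluate().
check_call([PYTHON, "train.py", "--model_dir", MODEL_DIR, "--data_dir", DATA_DIR])

# 2. Optionally search over learning rates; one sub-directory per job is created
#    under experiments/learning_rate.
check_call([PYTHON, "search_hyperparams.py",
            "--parent_dir", "experiments/learning_rate", "--data_dir", DATA_DIR])

# 3. Aggregate metrics_eval_best_weights.json from all experiments into results.md.
check_call([PYTHON, "synthesize_results.py", "--parent_dir", "experiments"])

# 4. Evaluate the best checkpoint on the held-out test_signs split.
check_call([PYTHON, "evaluate.py", "--model_dir", MODEL_DIR,
            "--data_dir", DATA_DIR, "--restore_from", "best_weights"])
```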