├── .gitignore ├── LICENSE ├── README.md ├── pytorch ├── nlp │ ├── README.md │ ├── build_kaggle_dataset.py │ ├── build_vocab.py │ ├── data │ │ ├── kaggle │ │ │ └── .gitkeep │ │ └── small │ │ │ ├── test │ │ │ ├── labels.txt │ │ │ └── sentences.txt │ │ │ ├── train │ │ │ ├── labels.txt │ │ │ └── sentences.txt │ │ │ └── val │ │ │ ├── labels.txt │ │ │ └── sentences.txt │ ├── evaluate.py │ ├── experiments │ │ ├── base_model │ │ │ └── params.json │ │ └── learning_rate │ │ │ └── params.json │ ├── model │ │ ├── __init__.py │ │ ├── data_loader.py │ │ └── net.py │ ├── requirements.txt │ ├── search_hyperparams.py │ ├── synthesize_results.py │ ├── train.py │ └── utils.py └── vision │ ├── README.md │ ├── build_dataset.py │ ├── data │ └── .gitkeep │ ├── evaluate.py │ ├── experiments │ ├── .gitkeep │ ├── base_model │ │ └── params.json │ └── learning_rate │ │ └── params.json │ ├── model │ ├── __init__.py │ ├── data_loader.py │ └── net.py │ ├── requirements.txt │ ├── search_hyperparams.py │ ├── synthesize_results.py │ ├── train.py │ └── utils.py └── tensorflow ├── nlp ├── README.md ├── build_kaggle_dataset.py ├── build_vocab.py ├── data │ ├── kaggle │ │ └── .gitkeep │ └── small │ │ ├── dev │ │ ├── labels.txt │ │ └── sentences.txt │ │ ├── test │ │ ├── labels.txt │ │ └── sentences.txt │ │ └── train │ │ ├── labels.txt │ │ └── sentences.txt ├── evaluate.py ├── experiments │ ├── .gitkeep │ ├── base_model │ │ └── params.json │ └── learning_rate │ │ └── params.json ├── model │ ├── __init__.py │ ├── evaluation.py │ ├── input_fn.py │ ├── model_fn.py │ ├── training.py │ └── utils.py ├── requirements.txt ├── search_hyperparams.py ├── synthesize_results.py └── train.py └── vision ├── README.md ├── build_dataset.py ├── data └── .gitkeep ├── evaluate.py ├── experiments ├── .gitkeep ├── base_model │ └── params.json └── learning_rate │ └── params.json ├── model ├── __init__.py ├── evaluation.py ├── input_fn.py ├── model_fn.py ├── training.py └── utils.py ├── requirements.txt ├── search_hyperparams.py ├── synthesize_results.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 CS230 Teaching team 4 | 5 | Teaching assistants contributors (Winter 2018): Guillaume Genthial, Olivier Moindrot, Surag Nair. 6 | Instructors: Kian Katanforoosh, Andrew Ng. 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS230 Code Examples 2 | 3 | [Tutorials](https://cs230-stanford.github.io) 4 | 5 | 6 | We are happy to introduce some code examples that you can use for your CS230 projects. The code contains examples for TensorFlow and PyTorch, in vision and NLP. The structure of the repository is the following: 7 | 8 | ``` 9 | README.md 10 | pytorch/ 11 | vision/ 12 | README.md 13 | nlp/ 14 | README.md 15 | tensorflow/ 16 | vision/ 17 | README.md 18 | nlp/ 19 | README.md 20 | ``` 21 | 22 | You'll find a README.md in each sub-directory. 
-------------------------------------------------------------------------------- /pytorch/nlp/README.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition with PyTorch 2 | 3 | _Authors: Surag Nair, Guillaume Genthial and Olivier Moindrot_ 4 | 5 | Take the time to read the [tutorials](https://cs230-stanford.github.io/project-starter-code.html). 6 | 7 | Note : all scripts must be run in `pytorch/nlp`. 8 | 9 | ## Requirements 10 | 11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html). 12 | 13 | ``` 14 | virtualenv -p python3 .env 15 | source .env/bin/activate 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | When you're done working on the project, deactivate the virtual environment with `deactivate`. 20 | 21 | ## Task 22 | 23 | Given a sentence, give a tag to each word ([Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition)) 24 | 25 | ``` 26 | John lives in New York 27 | B-PER O O B-LOC I-LOC 28 | ``` 29 | 30 | ## [optional] Download the Kaggle dataset (~5 min) 31 | 32 | We provide a small subset of the kaggle dataset (30 sentences) for testing in `data/small` but you are encouraged to download the original version on the [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) website. 33 | 34 | 1. **Download the dataset** `ner_dataset.csv` on [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) and save it under the `nlp/data/kaggle` directory. Make sure you download the simple version `ner_dataset.csv` and NOT the full version `ner.csv`. 35 | 36 | 2. **Build the dataset** Run the following script 37 | 38 | ``` 39 | python build_kaggle_dataset.py 40 | ``` 41 | 42 | It will extract the sentences and labels from the dataset, split it into train/val/test and save it in a convenient format for our model. 43 | 44 | _Debug_ If you get some errors, check that you downloaded the right file and saved it in the right directory. If you have issues with encoding, try running the script with python 2.7. 45 | 46 | 3. In the next section, change `data/small` by `data/kaggle` 47 | 48 | ## Quickstart (~10 min) 49 | 50 | 1. **Build** vocabularies and parameters for your dataset by running 51 | 52 | ``` 53 | python build_vocab.py --data_dir data/small 54 | ``` 55 | 56 | It will write vocabulary files `words.txt` and `tags.txt` containing the words and tags in the dataset. It will also save a `dataset_params.json` with some extra information. 57 | 58 | 2. **Your first experiment** We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the hyperparameters for the experiment. It looks like 59 | 60 | ```json 61 | { 62 | "learning_rate": 1e-3, 63 | "batch_size": 5, 64 | "num_epochs": 2 65 | } 66 | ``` 67 | 68 | For every new experiment, you will need to create a new directory under `experiments` with a `params.json` file. 69 | 70 | 3. **Train** your experiment. Simply run 71 | 72 | ``` 73 | python train.py --data_dir data/small --model_dir experiments/base_model 74 | ``` 75 | 76 | It will instantiate a model and train it on the training set following the hyperparameters specified in `params.json`. It will also evaluate some metrics on the development set. 77 | 78 | 4. **Your first hyperparameters search** We created a new directory `learning_rate` in `experiments` for you. 
Now, run 79 | 80 | ``` 81 | python search_hyperparams.py --data_dir data/small --parent_dir experiments/learning_rate 82 | ``` 83 | 84 | It will train and evaluate a model with different values of learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`. 85 | 86 | 5. **Display the results** of the hyperparameter search in a nice format 87 | 88 | ``` 89 | python synthesize_results.py --parent_dir experiments/learning_rate 90 | ``` 91 | 92 | 6. **Evaluation on the test set** Once you've run many experiments and selected your best model and hyperparameters based on the performance on the development set, you can finally evaluate the performance of your model on the test set. Run 93 | 94 | ``` 95 | python evaluate.py --data_dir data/small --model_dir experiments/base_model 96 | ``` 97 | 98 | ## Guidelines for more advanced use 99 | 100 | We recommend reading through `train.py` to get a high-level overview of the training loop steps: 101 | 102 | - loading the hyperparameters for the experiment (the `params.json`) 103 | - loading the training and validation data 104 | - creating the model, loss_fn and metrics 105 | - training the model for a given number of epochs by calling `train_and_evaluate(...)` 106 | 107 | You can then go through `model/data_loader.py` to understand the following steps: 108 | 109 | - loading the vocabularies from the `words.txt` and `tags.txt` files 110 | - creating the sentences/labels datasets from the text files 111 | - how the vocabulary is used to map tokens to their indices 112 | - how the `data_iterator` creates a batch of data and labels and pads sentences (see the sketch at the end of this section) 113 | 114 | Once you get the high-level idea, depending on your dataset, you might want to modify 115 | 116 | - `model/net.py` to change the neural network, loss function and metrics 117 | - `model/data_loader.py` to adapt the data loader to your specific needs 118 | - `train.py` for changing the optimizer 119 | - `train.py` and `evaluate.py`, since changes to the model or its inputs usually require corresponding changes there 120 | 121 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs.
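Once you are comfortable with the flow above, it can help to load one batch interactively and look at the padding directly. The snippet below is a minimal sketch (not part of the starter code): it assumes you run it from `pytorch/nlp`, that you have already run `python build_vocab.py --data_dir data/small`, and it reuses the default `data/small` and `experiments/base_model` paths from the Quickstart.

```python
# Minimal sketch: inspect one padded batch produced by model/data_loader.py.
# Assumes build_vocab.py has been run on data/small so that words.txt, tags.txt
# and dataset_params.json exist.
import torch

import utils
from model.data_loader import DataLoader

params = utils.Params('experiments/base_model/params.json')
params.cuda = torch.cuda.is_available()          # data_iterator checks this flag

data_loader = DataLoader('data/small', params)   # loads the vocab and dataset_params.json
data = data_loader.load_data(['train'], 'data/small')
train_iterator = data_loader.data_iterator(data['train'], params, shuffle=False)

batch_data, batch_labels = next(train_iterator)
print(batch_data.shape)    # (batch_size, length of the longest sentence in this batch)
print(batch_labels[0])     # tag indices; -1 marks PADding positions
```

Shorter sentences in the batch are padded with the index of the PAD word and their labels are set to -1, which is exactly what `loss_fn` in `model/net.py` uses to mask them out of the loss.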
122 | 123 | ## Resources 124 | 125 | - [PyTorch documentation](http://pytorch.org/docs/1.2.0/) 126 | - [Tutorials](http://pytorch.org/tutorials/) 127 | - [PyTorch warm-up](https://github.com/jcjohnson/pytorch-examples) 128 | -------------------------------------------------------------------------------- /pytorch/nlp/build_kaggle_dataset.py: -------------------------------------------------------------------------------- 1 | """Read, split and save the kaggle dataset for our model""" 2 | 3 | import csv 4 | import os 5 | import sys 6 | 7 | 8 | def load_dataset(path_csv): 9 | """Loads dataset into memory from csv file""" 10 | # Open the csv file, need to specify the encoding for python3 11 | use_python3 = sys.version_info[0] >= 3 12 | with (open(path_csv, encoding="windows-1252") if use_python3 else open(path_csv)) as f: 13 | csv_file = csv.reader(f, delimiter=',') 14 | dataset = [] 15 | words, tags = [], [] 16 | 17 | # Each line of the csv corresponds to one word 18 | for idx, row in enumerate(csv_file): 19 | if idx == 0: continue 20 | sentence, word, pos, tag = row 21 | # If the first column is non empty it means we reached a new sentence 22 | if len(sentence) != 0: 23 | if len(words) > 0: 24 | assert len(words) == len(tags) 25 | dataset.append((words, tags)) 26 | words, tags = [], [] 27 | try: 28 | word, tag = str(word), str(tag) 29 | words.append(word) 30 | tags.append(tag) 31 | except UnicodeDecodeError as e: 32 | print("An exception was raised, skipping a word: {}".format(e)) 33 | pass 34 | 35 | return dataset 36 | 37 | 38 | def save_dataset(dataset, save_dir): 39 | """Writes sentences.txt and labels.txt files in save_dir from dataset 40 | 41 | Args: 42 | dataset: ([(["a", "cat"], ["O", "O"]), ...]) 43 | save_dir: (string) 44 | """ 45 | # Create directory if it doesn't exist 46 | print("Saving in {}...".format(save_dir)) 47 | if not os.path.exists(save_dir): 48 | os.makedirs(save_dir) 49 | 50 | # Export the dataset 51 | with open(os.path.join(save_dir, 'sentences.txt'), 'w') as file_sentences: 52 | with open(os.path.join(save_dir, 'labels.txt'), 'w') as file_labels: 53 | for words, tags in dataset: 54 | file_sentences.write("{}\n".format(" ".join(words))) 55 | file_labels.write("{}\n".format(" ".join(tags))) 56 | print("- done.") 57 | 58 | 59 | if __name__ == "__main__": 60 | # Check that the dataset exists (you need to make sure you haven't downloaded the `ner.csv`) 61 | path_dataset = 'data/kaggle/ner_dataset.csv' 62 | msg = "{} file not found. 
Make sure you have downloaded the right dataset".format(path_dataset) 63 | assert os.path.isfile(path_dataset), msg 64 | 65 | # Load the dataset into memory 66 | print("Loading Kaggle dataset into memory...") 67 | dataset = load_dataset(path_dataset) 68 | print("- done.") 69 | 70 | # Split the dataset into train, val and test (dummy split with no shuffle) 71 | train_dataset = dataset[:int(0.7*len(dataset))] 72 | val_dataset = dataset[int(0.7*len(dataset)) : int(0.85*len(dataset))] 73 | test_dataset = dataset[int(0.85*len(dataset)):] 74 | 75 | # Save the datasets to files 76 | save_dataset(train_dataset, 'data/kaggle/train') 77 | save_dataset(val_dataset, 'data/kaggle/val') 78 | save_dataset(test_dataset, 'data/kaggle/test') -------------------------------------------------------------------------------- /pytorch/nlp/build_vocab.py: -------------------------------------------------------------------------------- 1 | """Build vocabularies of words and tags from datasets""" 2 | 3 | import argparse 4 | from collections import Counter 5 | import json 6 | import os 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--min_count_word', default=1, help="Minimum count for words in the dataset", type=int) 11 | parser.add_argument('--min_count_tag', default=1, help="Minimum count for tags in the dataset", type=int) 12 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 13 | 14 | # Hyper parameters for the vocab 15 | PAD_WORD = '<pad>' 16 | PAD_TAG = 'O' 17 | UNK_WORD = 'UNK' 18 | 19 | 20 | def save_vocab_to_txt_file(vocab, txt_path): 21 | """Writes one token per line, 0-based line id corresponds to the id of the token. 22 | 23 | Args: 24 | vocab: (iterable object) yields token 25 | txt_path: (string) path to vocab file 26 | """ 27 | with open(txt_path, "w") as f: 28 | for token in vocab: 29 | f.write(token + '\n') 30 | 31 | 32 | def save_dict_to_json(d, json_path): 33 | """Saves dict to json file 34 | 35 | Args: 36 | d: (dict) 37 | json_path: (string) path to json file 38 | """ 39 | with open(json_path, 'w') as f: 40 | d = {k: v for k, v in d.items()} 41 | json.dump(d, f, indent=4) 42 | 43 | 44 | def update_vocab(txt_path, vocab): 45 | """Update word and tag vocabulary from dataset 46 | 47 | Args: 48 | txt_path: (string) path to file, one sentence per line 49 | vocab: (dict or Counter) with update method 50 | 51 | Returns: 52 | dataset_size: (int) number of elements in the dataset 53 | """ 54 | with open(txt_path) as f: 55 | for i, line in enumerate(f): 56 | vocab.update(line.strip().split(' ')) 57 | 58 | return i + 1 59 | 60 | 61 | if __name__ == '__main__': 62 | args = parser.parse_args() 63 | 64 | # Build word vocab with train and test datasets 65 | print("Building word vocabulary...") 66 | words = Counter() 67 | size_train_sentences = update_vocab(os.path.join(args.data_dir, 'train/sentences.txt'), words) 68 | size_dev_sentences = update_vocab(os.path.join(args.data_dir, 'val/sentences.txt'), words) 69 | size_test_sentences = update_vocab(os.path.join(args.data_dir, 'test/sentences.txt'), words) 70 | print("- done.") 71 | 72 | # Build tag vocab with train and test datasets 73 | print("Building tag vocabulary...") 74 | tags = Counter() 75 | size_train_tags = update_vocab(os.path.join(args.data_dir, 'train/labels.txt'), tags) 76 | size_dev_tags = update_vocab(os.path.join(args.data_dir, 'val/labels.txt'), tags) 77 | size_test_tags = update_vocab(os.path.join(args.data_dir, 'test/labels.txt'), tags) 78 | print("- done.") 79 | 80 | # 
Assert same number of examples in datasets 81 | assert size_train_sentences == size_train_tags 82 | assert size_dev_sentences == size_dev_tags 83 | assert size_test_sentences == size_test_tags 84 | 85 | # Only keep most frequent tokens 86 | words = [tok for tok, count in words.items() if count >= args.min_count_word] 87 | tags = [tok for tok, count in tags.items() if count >= args.min_count_tag] 88 | 89 | # Add pad tokens 90 | if PAD_WORD not in words: words.append(PAD_WORD) 91 | if PAD_TAG not in tags: tags.append(PAD_TAG) 92 | 93 | # add word for unknown words 94 | words.append(UNK_WORD) 95 | 96 | # Save vocabularies to file 97 | print("Saving vocabularies to file...") 98 | save_vocab_to_txt_file(words, os.path.join(args.data_dir, 'words.txt')) 99 | save_vocab_to_txt_file(tags, os.path.join(args.data_dir, 'tags.txt')) 100 | print("- done.") 101 | 102 | # Save datasets properties in json file 103 | sizes = { 104 | 'train_size': size_train_sentences, 105 | 'dev_size': size_dev_sentences, 106 | 'test_size': size_test_sentences, 107 | 'vocab_size': len(words), 108 | 'number_of_tags': len(tags), 109 | 'pad_word': PAD_WORD, 110 | 'pad_tag': PAD_TAG, 111 | 'unk_word': UNK_WORD 112 | } 113 | save_dict_to_json(sizes, os.path.join(args.data_dir, 'dataset_params.json')) 114 | 115 | # Logging sizes 116 | to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items()) 117 | print("Characteristics of the dataset:\n{}".format(to_print)) 118 | -------------------------------------------------------------------------------- /pytorch/nlp/data/kaggle/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/nlp/data/kaggle/.gitkeep -------------------------------------------------------------------------------- /pytorch/nlp/data/small/test/labels.txt: -------------------------------------------------------------------------------- 1 | O O B-org I-org I-org I-org O B-geo O B-gpe B-per I-per I-per I-per O O O O O O B-geo O O 2 | O O O O O O O O O 3 | O O O O O B-geo O O O B-tim O O O O O O O 4 | O O O O O O O O O O O O O B-geo O O O O O O O O O O 5 | O B-org I-org I-org I-org I-org O O O O B-org I-org I-org I-org I-org O O O O O O O O O O B-geo O 6 | O O O O O O O O O O B-geo O O B-org I-org I-org I-org O 7 | O O O O B-org O O O O O O O O O B-geo O O O O 8 | O O O O B-geo O O O O O O O O O O O O O O O O O O O B-geo O 9 | B-gpe O O O O O O O O O O O O O O B-gpe O O B-per I-per O 10 | O O O O O O B-tim O O O O O O O O O O B-geo O -------------------------------------------------------------------------------- /pytorch/nlp/data/small/test/sentences.txt: -------------------------------------------------------------------------------- 1 | At the Group of Eight summit in Scotland , Japanese Prime Minister Junichiro Koizumi said he is outraged by the London attacks . 2 | He noted terrorist acts must not be forgivable . 3 | Sarin gas attacks on the Tokyo subway system in 1995 killed 12 people and injured thousands . 4 | A human rights group has called on Asian leaders to increase pressure on Burma to hasten democratic reforms and stop human rights abuses . 5 | The Alternative ASEAN Network for Burma said officials from the Association of Southeast Asian Nations meeting this week should consider new options in dealing with Burma . 6 | It said leaders should consider supporting a possible resolution on Burma by the United Nations Security Council . 
7 | The group also urged ASEAN leaders to acknowledge the many security problems caused by Burma 's military regime . 8 | The rights group accuses Burma 's government of involvement in illegal drug trafficking and human rights abuses , especially against some ethnic groups in Burma . 9 | Iraqi officials say gunmen have killed a member of the secular coalition led by former Iraqi prime minister Ayad Allawi . 10 | Officials say Faras al-Jabouri was shot Saturday after gunmen raided his home near the northern city of Mosul . -------------------------------------------------------------------------------- /pytorch/nlp/data/small/train/labels.txt: -------------------------------------------------------------------------------- 1 | O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O 2 | O O O O O O O O O O O O O O O O O O B-per O O O O O O O O O O O 3 | O O O O O O O O O O O B-geo I-geo O 4 | O O O O O O O O O O O O O O O 5 | O O O O O O O O O O O B-geo O O B-org I-org O O O B-gpe O O O B-geo O 6 | O O O O O B-gpe O O O O B-geo O O O O O O O B-gpe O O O O O 7 | O B-geo O O O O O O O O O O O O B-geo O B-geo O O B-geo O 8 | O B-org I-org I-org I-org O O O O O O O O B-geo B-tim O O O O O B-gpe O O O O O O O 9 | B-gpe O O O O O O O O O O B-geo O O O 10 | B-gpe O O O O O O O O O O O O O O B-tim O O O B-org O O O O O -------------------------------------------------------------------------------- /pytorch/nlp/data/small/train/sentences.txt: -------------------------------------------------------------------------------- 1 | Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country . 2 | Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . " 3 | They marched from the Houses of Parliament to a rally in Hyde Park . 4 | Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 . 5 | The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton . 6 | The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country . 7 | The London march came ahead of anti-war protests today in other cities , including Rome , Paris , and Madrid . 8 | The International Atomic Energy Agency is to hold second day of talks in Vienna Wednesday on how to respond to Iran 's resumption of low-level uranium conversion . 9 | Iran this week restarted parts of the conversion process at its Isfahan nuclear plant . 10 | Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning . 
-------------------------------------------------------------------------------- /pytorch/nlp/data/small/val/labels.txt: -------------------------------------------------------------------------------- 1 | B-per I-per O O O B-tim O O B-tim O O O B-geo O O O O O O O O O O 2 | O O O O B-gpe B-per I-per I-per O O B-org I-org I-org I-org I-org O O O O O O O O B-geo O O O O O O O O O 3 | O B-org I-org I-org O O O O O O O O 4 | B-gpe O O O O O O O B-gpe O O O O O O O O O O O O O O O O B-geo O 5 | O O O B-org I-org O O O O O B-geo O O O O B-org O O O O B-gpe O O O O O 6 | O O O B-per I-per I-per O O B-tim O B-gpe O O O O O O O B-gpe O O B-geo O O O O O O O O B-geo O 7 | B-org O O O O O O O O O O O O O O O O O O O O O O 8 | O B-tim O B-gpe O O O O O O O O O O O O O O O O O O B-org O O O 9 | O B-gpe O O O O O O O O O O O O O O O O O O B-org O B-org O 10 | B-geo O O O O O O O O O O O O O O O O O O O O O O O -------------------------------------------------------------------------------- /pytorch/nlp/data/small/val/sentences.txt: -------------------------------------------------------------------------------- 1 | Mr. Nour was arrested in January and spent six weeks in a Cairo jail , before his release on bond last week . 2 | In a letter to Egyptian President Hosni Mubarak , the New York-based Human Rights Watch said it was dismayed by what it called Cairo 's " radical intolerance " toward political dissent . 3 | The U.S. State Department and the European parliament also voiced concern . 4 | Pakistani military officials say 14 of about 40 Pakistani soldiers who went missing following an attack on a security checkpoint have been found in neighboring Afghanistan . 5 | Officials say the Frontier Corps paramilitary troops disappeared from the Mohmand tribal region after a Taliban insurgent attack along the Afghan border earlier this week . 6 | Military spokesman Major General Athar Abbas told reporters Thursday that Afghan authorities handed over the troops to the Pakistani consulate in Jalalabad and the soldiers were being flown back to Pakistan . 7 | Taliban militants said they captured 10 soldiers during the attack on the paramilitary post , but officials could not verify the claim . 8 | On Wednesday , Pakistani officials said 10 paramilitary soldiers and at least 36 militants were killed in fighting in the country 's Bajaur tribal region . 9 | The Pakistani military has twice declared victory there following offensives aimed at clearing the area of insurgents linked to the Taliban and al-Qaida . 10 | Thailand 's military has named a committee to begin the process of writing a new constitution , following a military coup last month . 
-------------------------------------------------------------------------------- /pytorch/nlp/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluates the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | import utils 10 | import model.net as net 11 | from model.data_loader import DataLoader 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 15 | parser.add_argument('--model_dir', default='experiments/base_model', help="Directory containing params.json") 16 | parser.add_argument('--restore_file', default='best', help="name of the file in --model_dir \ 17 | containing weights to load") 18 | 19 | 20 | def evaluate(model, loss_fn, data_iterator, metrics, params, num_steps): 21 | """Evaluate the model on `num_steps` batches. 22 | 23 | Args: 24 | model: (torch.nn.Module) the neural network 25 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 26 | data_iterator: (generator) a generator that generates batches of data and labels 27 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 28 | params: (Params) hyperparameters 29 | num_steps: (int) number of batches to train on, each of size params.batch_size 30 | """ 31 | 32 | # set model to evaluation mode 33 | model.eval() 34 | 35 | # summary for current eval loop 36 | summ = [] 37 | 38 | # compute metrics over the dataset 39 | for _ in range(num_steps): 40 | # fetch the next evaluation batch 41 | data_batch, labels_batch = next(data_iterator) 42 | 43 | # compute model output 44 | output_batch = model(data_batch) 45 | loss = loss_fn(output_batch, labels_batch) 46 | 47 | # extract data from torch Variable, move to cpu, convert to numpy arrays 48 | output_batch = output_batch.data.cpu().numpy() 49 | labels_batch = labels_batch.data.cpu().numpy() 50 | 51 | # compute all metrics on this batch 52 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 53 | for metric in metrics} 54 | summary_batch['loss'] = loss.item() 55 | summ.append(summary_batch) 56 | 57 | # compute mean of all metrics in summary 58 | metrics_mean = {metric:np.mean([x[metric] for x in summ]) for metric in summ[0]} 59 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items()) 60 | logging.info("- Eval metrics : " + metrics_string) 61 | return metrics_mean 62 | 63 | 64 | if __name__ == '__main__': 65 | """ 66 | Evaluate the model on the test set. 
67 | """ 68 | # Load the parameters 69 | args = parser.parse_args() 70 | json_path = os.path.join(args.model_dir, 'params.json') 71 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 72 | params = utils.Params(json_path) 73 | 74 | # use GPU if available 75 | params.cuda = torch.cuda.is_available() # use GPU is available 76 | 77 | # Set the random seed for reproducible experiments 78 | torch.manual_seed(230) 79 | if params.cuda: torch.cuda.manual_seed(230) 80 | 81 | # Get the logger 82 | utils.set_logger(os.path.join(args.model_dir, 'evaluate.log')) 83 | 84 | # Create the input data pipeline 85 | logging.info("Creating the dataset...") 86 | 87 | # load data 88 | data_loader = DataLoader(args.data_dir, params) 89 | data = data_loader.load_data(['test'], args.data_dir) 90 | test_data = data['test'] 91 | 92 | # specify the test set size 93 | params.test_size = test_data['size'] 94 | test_data_iterator = data_loader.data_iterator(test_data, params) 95 | 96 | logging.info("- done.") 97 | 98 | # Define the model 99 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 100 | 101 | loss_fn = net.loss_fn 102 | metrics = net.metrics 103 | 104 | logging.info("Starting evaluation") 105 | 106 | # Reload weights from the saved file 107 | utils.load_checkpoint(os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model) 108 | 109 | # Evaluate 110 | num_steps = (params.test_size + 1) // params.batch_size 111 | test_metrics = evaluate(model, loss_fn, test_data_iterator, metrics, params, num_steps) 112 | save_path = os.path.join(args.model_dir, "metrics_test_{}.json".format(args.restore_file)) 113 | utils.save_dict_to_json(test_metrics, save_path) 114 | -------------------------------------------------------------------------------- /pytorch/nlp/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 5, 4 | "num_epochs": 10, 5 | 6 | "lstm_hidden_dim": 50, 7 | "embedding_dim": 50, 8 | 9 | "save_summary_steps": 100 10 | } 11 | -------------------------------------------------------------------------------- /pytorch/nlp/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 5, 4 | "num_epochs": 10, 5 | 6 | "lstm_hidden_dim": 50, 7 | "embedding_dim": 50, 8 | 9 | "save_summary_steps": 100 10 | } 11 | -------------------------------------------------------------------------------- /pytorch/nlp/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/nlp/model/__init__.py -------------------------------------------------------------------------------- /pytorch/nlp/model/data_loader.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import os 4 | import sys 5 | 6 | import torch 7 | from torch.autograd import Variable 8 | 9 | import utils 10 | 11 | 12 | class DataLoader(object): 13 | """ 14 | Handles all aspects of the data. Stores the dataset_params, vocabulary and tags with their mappings to indices. 15 | """ 16 | def __init__(self, data_dir, params): 17 | """ 18 | Loads dataset_params, vocabulary and tags. Ensure you have run `build_vocab.py` on data_dir before using this 19 | class. 
20 | 21 | Args: 22 | data_dir: (string) directory containing the dataset 23 | params: (Params) hyperparameters of the training process. This function modifies params and appends 24 | dataset_params (such as vocab size, num_of_tags etc.) to params. 25 | """ 26 | 27 | # loading dataset_params 28 | json_path = os.path.join(data_dir, 'dataset_params.json') 29 | assert os.path.isfile(json_path), "No json file found at {}, run build_vocab.py".format(json_path) 30 | self.dataset_params = utils.Params(json_path) 31 | 32 | # loading vocab (we require this to map words to their indices) 33 | vocab_path = os.path.join(data_dir, 'words.txt') 34 | self.vocab = {} 35 | with open(vocab_path) as f: 36 | for i, l in enumerate(f.read().splitlines()): 37 | self.vocab[l] = i 38 | 39 | # setting the indices for UNKnown words and PADding symbols 40 | self.unk_ind = self.vocab[self.dataset_params.unk_word] 41 | self.pad_ind = self.vocab[self.dataset_params.pad_word] 42 | 43 | # loading tags (we require this to map tags to their indices) 44 | tags_path = os.path.join(data_dir, 'tags.txt') 45 | self.tag_map = {} 46 | with open(tags_path) as f: 47 | for i, t in enumerate(f.read().splitlines()): 48 | self.tag_map[t] = i 49 | 50 | # adding dataset parameters to param (e.g. vocab size, ) 51 | params.update(json_path) 52 | 53 | def load_sentences_labels(self, sentences_file, labels_file, d): 54 | """ 55 | Loads sentences and labels from their corresponding files. Maps tokens and tags to their indices and stores 56 | them in the provided dict d. 57 | 58 | Args: 59 | sentences_file: (string) file with sentences with tokens space-separated 60 | labels_file: (string) file with NER tags for the sentences in labels_file 61 | d: (dict) a dictionary in which the loaded data is stored 62 | """ 63 | 64 | sentences = [] 65 | labels = [] 66 | 67 | with open(sentences_file) as f: 68 | for sentence in f.read().splitlines(): 69 | # replace each token by its index if it is in vocab 70 | # else use index of UNK_WORD 71 | s = [self.vocab[token] if token in self.vocab 72 | else self.unk_ind 73 | for token in sentence.split(' ')] 74 | sentences.append(s) 75 | 76 | with open(labels_file) as f: 77 | for sentence in f.read().splitlines(): 78 | # replace each label by its index 79 | l = [self.tag_map[label] for label in sentence.split(' ')] 80 | labels.append(l) 81 | 82 | # checks to ensure there is a tag for each token 83 | assert len(labels) == len(sentences) 84 | for i in range(len(labels)): 85 | assert len(labels[i]) == len(sentences[i]) 86 | 87 | # storing sentences and labels in dict d 88 | d['data'] = sentences 89 | d['labels'] = labels 90 | d['size'] = len(sentences) 91 | 92 | def load_data(self, types, data_dir): 93 | """ 94 | Loads the data for each type in types from data_dir. 95 | 96 | Args: 97 | types: (list) has one or more of 'train', 'val', 'test' depending on which data is required 98 | data_dir: (string) directory containing the dataset 99 | 100 | Returns: 101 | data: (dict) contains the data with labels for each type in types 102 | 103 | """ 104 | data = {} 105 | 106 | for split in ['train', 'val', 'test']: 107 | if split in types: 108 | sentences_file = os.path.join(data_dir, split, "sentences.txt") 109 | labels_file = os.path.join(data_dir, split, "labels.txt") 110 | data[split] = {} 111 | self.load_sentences_labels(sentences_file, labels_file, data[split]) 112 | 113 | return data 114 | 115 | def data_iterator(self, data, params, shuffle=False): 116 | """ 117 | Returns a generator that yields batches data with labels. 
Batch size is params.batch_size. Expires after one 118 | pass over the data. 119 | 120 | Args: 121 | data: (dict) contains data which has keys 'data', 'labels' and 'size' 122 | params: (Params) hyperparameters of the training process. 123 | shuffle: (bool) whether the data should be shuffled 124 | 125 | Yields: 126 | batch_data: (Variable) dimension batch_size x seq_len with the sentence data 127 | batch_labels: (Variable) dimension batch_size x seq_len with the corresponding labels 128 | 129 | """ 130 | 131 | # make a list that decides the order in which we go over the data- this avoids explicit shuffling of data 132 | order = list(range(data['size'])) 133 | if shuffle: 134 | random.seed(230) 135 | random.shuffle(order) 136 | 137 | # one pass over data 138 | for i in range((data['size']+1)//params.batch_size): 139 | # fetch sentences and tags 140 | batch_sentences = [data['data'][idx] for idx in order[i*params.batch_size:(i+1)*params.batch_size]] 141 | batch_tags = [data['labels'][idx] for idx in order[i*params.batch_size:(i+1)*params.batch_size]] 142 | 143 | # compute length of longest sentence in batch 144 | batch_max_len = max([len(s) for s in batch_sentences]) 145 | 146 | # prepare a numpy array with the data, initialising the data with pad_ind and all labels with -1 147 | # initialising labels to -1 differentiates tokens with tags from PADding tokens 148 | batch_data = self.pad_ind*np.ones((len(batch_sentences), batch_max_len)) 149 | batch_labels = -1*np.ones((len(batch_sentences), batch_max_len)) 150 | 151 | # copy the data to the numpy array 152 | for j in range(len(batch_sentences)): 153 | cur_len = len(batch_sentences[j]) 154 | batch_data[j][:cur_len] = batch_sentences[j] 155 | batch_labels[j][:cur_len] = batch_tags[j] 156 | 157 | # since all data are indices, we convert them to torch LongTensors 158 | batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels) 159 | 160 | # shift tensors to GPU if available 161 | if params.cuda: 162 | batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda() 163 | 164 | # convert them to Variables to record operations in the computational graph 165 | batch_data, batch_labels = Variable(batch_data), Variable(batch_labels) 166 | 167 | yield batch_data, batch_labels 168 | -------------------------------------------------------------------------------- /pytorch/nlp/model/net.py: -------------------------------------------------------------------------------- 1 | """Defines the neural network, losss function and metrics""" 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class Net(nn.Module): 10 | """ 11 | This is the standard way to define your own network in PyTorch. You typically choose the components 12 | (e.g. LSTMs, linear layers etc.) of your network in the __init__ function. You then apply these layers 13 | on the input step-by-step in the forward function. You can use torch.nn.functional to apply functions 14 | such as F.relu, F.sigmoid, F.softmax. Be careful to ensure your dimensions are correct after each step. 15 | 16 | You are encouraged to have a look at the network in pytorch/vision/model/net.py to get a better sense of how 17 | you can go about defining your own network. 
18 | 19 | The documentation for all the various components available to you is here: http://pytorch.org/docs/master/nn.html 20 | """ 21 | 22 | def __init__(self, params): 23 | """ 24 | We define an recurrent network that predicts the NER tags for each token in the sentence. The components 25 | required are: 26 | 27 | - an embedding layer: this layer maps each index in range(params.vocab_size) to a params.embedding_dim vector 28 | - lstm: applying the LSTM on the sequential input returns an output for each token in the sentence 29 | - fc: a fully connected layer that converts the LSTM output for each token to a distribution over NER tags 30 | 31 | Args: 32 | params: (Params) contains vocab_size, embedding_dim, lstm_hidden_dim 33 | """ 34 | super(Net, self).__init__() 35 | 36 | # the embedding takes as input the vocab_size and the embedding_dim 37 | self.embedding = nn.Embedding(params.vocab_size, params.embedding_dim) 38 | 39 | # the LSTM takes as input the size of its input (embedding_dim), its hidden size 40 | # for more details on how to use it, check out the documentation 41 | self.lstm = nn.LSTM(params.embedding_dim, 42 | params.lstm_hidden_dim, batch_first=True) 43 | 44 | # the fully connected layer transforms the output to give the final output layer 45 | self.fc = nn.Linear(params.lstm_hidden_dim, params.number_of_tags) 46 | 47 | def forward(self, s): 48 | """ 49 | This function defines how we use the components of our network to operate on an input batch. 50 | 51 | Args: 52 | s: (Variable) contains a batch of sentences, of dimension batch_size x seq_len, where seq_len is 53 | the length of the longest sentence in the batch. For sentences shorter than seq_len, the remaining 54 | tokens are PADding tokens. Each row is a sentence with each element corresponding to the index of 55 | the token in the vocab. 56 | 57 | Returns: 58 | out: (Variable) dimension batch_size*seq_len x num_tags with the log probabilities of tokens for each token 59 | of each sentence. 60 | 61 | Note: the dimensions after each step are provided 62 | """ 63 | # -> batch_size x seq_len 64 | # apply the embedding layer that maps each token to its embedding 65 | # dim: batch_size x seq_len x embedding_dim 66 | s = self.embedding(s) 67 | 68 | # run the LSTM along the sentences of length seq_len 69 | # dim: batch_size x seq_len x lstm_hidden_dim 70 | s, _ = self.lstm(s) 71 | 72 | # make the Variable contiguous in memory (a PyTorch artefact) 73 | s = s.contiguous() 74 | 75 | # reshape the Variable so that each row contains one token 76 | # dim: batch_size*seq_len x lstm_hidden_dim 77 | s = s.view(-1, s.shape[2]) 78 | 79 | # apply the fully connected layer and obtain the output (before softmax) for each token 80 | s = self.fc(s) # dim: batch_size*seq_len x num_tags 81 | 82 | # apply log softmax on each token's output (this is recommended over applying softmax 83 | # since it is numerically more stable) 84 | return F.log_softmax(s, dim=1) # dim: batch_size*seq_len x num_tags 85 | 86 | 87 | def loss_fn(outputs, labels): 88 | """ 89 | Compute the cross entropy loss given outputs from the model and labels for all tokens. Exclude loss terms 90 | for PADding tokens. 91 | 92 | Args: 93 | outputs: (Variable) dimension batch_size*seq_len x num_tags - log softmax output of the model 94 | labels: (Variable) dimension batch_size x seq_len where each element is either a label in [0, 1, ... num_tag-1], 95 | or -1 in case it is a PADding token. 
96 | 97 | Returns: 98 | loss: (Variable) cross entropy loss for all tokens in the batch 99 | 100 | Note: you may use a standard loss function from http://pytorch.org/docs/master/nn.html#loss-functions. This example 101 | demonstrates how you can easily define a custom loss function. 102 | """ 103 | 104 | # reshape labels to give a flat vector of length batch_size*seq_len 105 | labels = labels.view(-1) 106 | 107 | # since PADding tokens have label -1, we can generate a mask to exclude the loss from those terms 108 | mask = (labels >= 0).float() 109 | 110 | # indexing with negative values is not supported. Since PADded tokens have label -1, we convert them to a positive 111 | # number. This does not affect training, since we ignore the PADded tokens with the mask. 112 | labels = labels % outputs.shape[1] 113 | 114 | num_tokens = int(torch.sum(mask)) 115 | 116 | # compute cross entropy loss for all tokens (except PADding tokens), by multiplying with mask. 117 | return -torch.sum(outputs[range(outputs.shape[0]), labels]*mask)/num_tokens 118 | 119 | 120 | def accuracy(outputs, labels): 121 | """ 122 | Compute the accuracy, given the outputs and labels for all tokens. Exclude PADding terms. 123 | 124 | Args: 125 | outputs: (np.ndarray) dimension batch_size*seq_len x num_tags - log softmax output of the model 126 | labels: (np.ndarray) dimension batch_size x seq_len where each element is either a label in 127 | [0, 1, ... num_tag-1], or -1 in case it is a PADding token. 128 | 129 | Returns: (float) accuracy in [0,1] 130 | """ 131 | 132 | # reshape labels to give a flat vector of length batch_size*seq_len 133 | labels = labels.ravel() 134 | 135 | # since PADding tokens have label -1, we can generate a mask to exclude the loss from those terms 136 | mask = (labels >= 0) 137 | 138 | # np.argmax gives us the class predicted for each token by the model 139 | outputs = np.argmax(outputs, axis=1) 140 | 141 | # compare outputs with labels and divide by number of tokens (excluding PADding tokens) 142 | return np.sum(outputs == labels)/float(np.sum(mask)) 143 | 144 | 145 | # maintain all metrics required in this dictionary- these are used in the training and evaluation loops 146 | metrics = { 147 | 'accuracy': accuracy, 148 | # could add more metrics such as accuracy for each token type 149 | } 150 | -------------------------------------------------------------------------------- /pytorch/nlp/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | Pillow 3 | torch>=1.2 4 | tabulate 5 | tqdm 6 | -------------------------------------------------------------------------------- /pytorch/nlp/search_hyperparams.py: -------------------------------------------------------------------------------- 1 | """Peform hyperparemeters search""" 2 | 3 | import argparse 4 | import os 5 | from subprocess import check_call 6 | import sys 7 | 8 | import utils 9 | 10 | 11 | PYTHON = sys.executable 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--parent_dir', default='experiments/learning_rate', 14 | help='Directory containing params.json') 15 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 16 | 17 | 18 | def launch_training_job(parent_dir, data_dir, job_name, params): 19 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name 20 | 21 | Args: 22 | model_dir: (string) directory containing config, weights and log 23 | data_dir: (string) directory containing the dataset 24 | 
params: (dict) containing hyperparameters 25 | """ 26 | # Create a new folder in parent_dir with unique_name "job_name" 27 | model_dir = os.path.join(parent_dir, job_name) 28 | if not os.path.exists(model_dir): 29 | os.makedirs(model_dir) 30 | 31 | # Write parameters in json file 32 | json_path = os.path.join(model_dir, 'params.json') 33 | params.save(json_path) 34 | 35 | # Launch training with this config 36 | cmd = "{python} train.py --model_dir={model_dir} --data_dir {data_dir}".format(python=PYTHON, model_dir=model_dir, 37 | data_dir=data_dir) 38 | print(cmd) 39 | check_call(cmd, shell=True) 40 | 41 | 42 | if __name__ == "__main__": 43 | # Load the "reference" parameters from parent_dir json file 44 | args = parser.parse_args() 45 | json_path = os.path.join(args.parent_dir, 'params.json') 46 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 47 | params = utils.Params(json_path) 48 | 49 | # Perform hypersearch over one parameter 50 | learning_rates = [1e-4, 1e-3, 1e-2] 51 | 52 | for learning_rate in learning_rates: 53 | # Modify the relevant parameter in params 54 | params.learning_rate = learning_rate 55 | 56 | # Launch job (name has to be unique) 57 | job_name = "learning_rate_{}".format(learning_rate) 58 | launch_training_job(args.parent_dir, args.data_dir, job_name, params) 59 | -------------------------------------------------------------------------------- /pytorch/nlp/synthesize_results.py: -------------------------------------------------------------------------------- 1 | """Aggregates results from the metrics_eval_best_weights.json in a parent folder""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in folder `parent_dir`. 17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_dev.json` 20 | 21 | Args: 22 | parent_dir: (string) path to directory containing experiments results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_val_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. 
Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) -------------------------------------------------------------------------------- /pytorch/nlp/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | import torch.optim as optim 10 | from tqdm import trange 11 | 12 | import utils 13 | import model.net as net 14 | from model.data_loader import DataLoader 15 | from evaluate import evaluate 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data_dir', default='data/small', 20 | help="Directory containing the dataset") 21 | parser.add_argument('--model_dir', default='experiments/base_model', 22 | help="Directory containing params.json") 23 | parser.add_argument('--restore_file', default=None, 24 | help="Optional, name of the file in --model_dir containing weights to reload before \ 25 | training") # 'best' or 'train' 26 | 27 | 28 | def train(model, optimizer, loss_fn, data_iterator, metrics, params, num_steps): 29 | """Train the model on `num_steps` batches 30 | 31 | Args: 32 | model: (torch.nn.Module) the neural network 33 | optimizer: (torch.optim) optimizer for parameters of model 34 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 35 | data_iterator: (generator) a generator that generates batches of data and labels 36 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 37 | params: (Params) hyperparameters 38 | num_steps: (int) number of batches to train on, each of size params.batch_size 39 | """ 40 | 41 | # set model to training mode 42 | model.train() 43 | 44 | # summary for current training loop and a running average object for loss 45 | summ = [] 46 | loss_avg = utils.RunningAverage() 47 | 48 | # Use tqdm for progress bar 49 | t = trange(num_steps) 50 | for i in t: 51 | # fetch the next training batch 52 | train_batch, labels_batch = next(data_iterator) 53 | 54 | # compute model output and loss 55 | output_batch = model(train_batch) 56 | loss = loss_fn(output_batch, labels_batch) 57 | 58 | # clear previous gradients, compute gradients of all variables wrt loss 59 | optimizer.zero_grad() 60 | loss.backward() 61 | 62 | # performs updates using calculated gradients 63 | optimizer.step() 64 | 65 | # Evaluate summaries only once in a while 66 | if i % params.save_summary_steps == 0: 67 | # extract data from torch Variable, move to cpu, convert to numpy arrays 68 | output_batch = output_batch.data.cpu().numpy() 69 | labels_batch = labels_batch.data.cpu().numpy() 70 | 71 | # compute all metrics on this batch 72 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 73 | for metric in metrics} 74 | 
summary_batch['loss'] = loss.item() 75 | summ.append(summary_batch) 76 | 77 | # update the average loss 78 | loss_avg.update(loss.item()) 79 | t.set_postfix(loss='{:05.3f}'.format(loss_avg())) 80 | 81 | # compute mean of all metrics in summary 82 | metrics_mean = {metric: np.mean([x[metric] 83 | for x in summ]) for metric in summ[0]} 84 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) 85 | for k, v in metrics_mean.items()) 86 | logging.info("- Train metrics: " + metrics_string) 87 | 88 | 89 | def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, restore_file=None): 90 | """Train the model and evaluate every epoch. 91 | 92 | Args: 93 | model: (torch.nn.Module) the neural network 94 | train_data: (dict) training data with keys 'data' and 'labels' 95 | val_data: (dict) validaion data with keys 'data' and 'labels' 96 | optimizer: (torch.optim) optimizer for parameters of model 97 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 98 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 99 | params: (Params) hyperparameters 100 | model_dir: (string) directory containing config, weights and log 101 | restore_file: (string) optional- name of file to restore from (without its extension .pth.tar) 102 | """ 103 | # reload weights from restore_file if specified 104 | if restore_file is not None: 105 | restore_path = os.path.join( 106 | args.model_dir, args.restore_file + '.pth.tar') 107 | logging.info("Restoring parameters from {}".format(restore_path)) 108 | utils.load_checkpoint(restore_path, model, optimizer) 109 | 110 | best_val_acc = 0.0 111 | 112 | for epoch in range(params.num_epochs): 113 | # Run one epoch 114 | logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs)) 115 | 116 | # compute number of batches in one epoch (one full pass over the training set) 117 | num_steps = (params.train_size + 1) // params.batch_size 118 | train_data_iterator = data_loader.data_iterator( 119 | train_data, params, shuffle=True) 120 | train(model, optimizer, loss_fn, train_data_iterator, 121 | metrics, params, num_steps) 122 | 123 | # Evaluate for one epoch on validation set 124 | num_steps = (params.val_size + 1) // params.batch_size 125 | val_data_iterator = data_loader.data_iterator( 126 | val_data, params, shuffle=False) 127 | val_metrics = evaluate( 128 | model, loss_fn, val_data_iterator, metrics, params, num_steps) 129 | 130 | val_acc = val_metrics['accuracy'] 131 | is_best = val_acc >= best_val_acc 132 | 133 | # Save weights 134 | utils.save_checkpoint({'epoch': epoch + 1, 135 | 'state_dict': model.state_dict(), 136 | 'optim_dict': optimizer.state_dict()}, 137 | is_best=is_best, 138 | checkpoint=model_dir) 139 | 140 | # If best_eval, best_save_path 141 | if is_best: 142 | logging.info("- Found new best accuracy") 143 | best_val_acc = val_acc 144 | 145 | # Save best val metrics in a json file in the model directory 146 | best_json_path = os.path.join( 147 | model_dir, "metrics_val_best_weights.json") 148 | utils.save_dict_to_json(val_metrics, best_json_path) 149 | 150 | # Save latest val metrics in a json file in the model directory 151 | last_json_path = os.path.join( 152 | model_dir, "metrics_val_last_weights.json") 153 | utils.save_dict_to_json(val_metrics, last_json_path) 154 | 155 | 156 | if __name__ == '__main__': 157 | 158 | # Load the parameters from json file 159 | args = parser.parse_args() 160 | json_path = 
os.path.join(args.model_dir, 'params.json') 161 | assert os.path.isfile( 162 | json_path), "No json configuration file found at {}".format(json_path) 163 | params = utils.Params(json_path) 164 | 165 | # use GPU if available 166 | params.cuda = torch.cuda.is_available() 167 | 168 | # Set the random seed for reproducible experiments 169 | torch.manual_seed(230) 170 | if params.cuda: 171 | torch.cuda.manual_seed(230) 172 | 173 | # Set the logger 174 | utils.set_logger(os.path.join(args.model_dir, 'train.log')) 175 | 176 | # Create the input data pipeline 177 | logging.info("Loading the datasets...") 178 | 179 | # load data 180 | data_loader = DataLoader(args.data_dir, params) 181 | data = data_loader.load_data(['train', 'val'], args.data_dir) 182 | train_data = data['train'] 183 | val_data = data['val'] 184 | 185 | # specify the train and val dataset sizes 186 | params.train_size = train_data['size'] 187 | params.val_size = val_data['size'] 188 | 189 | logging.info("- done.") 190 | 191 | # Define the model and optimizer 192 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 193 | optimizer = optim.Adam(model.parameters(), lr=params.learning_rate) 194 | 195 | # fetch loss function and metrics 196 | loss_fn = net.loss_fn 197 | metrics = net.metrics 198 | 199 | # Train the model 200 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 201 | train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, args.model_dir, 202 | args.restore_file) 203 | -------------------------------------------------------------------------------- /pytorch/nlp/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import shutil 5 | 6 | import torch 7 | 8 | 9 | class Params(): 10 | """Class that loads hyperparameters from a json file. 11 | 12 | Example: 13 | ``` 14 | params = Params(json_path) 15 | print(params.learning_rate) 16 | params.learning_rate = 0.5 # change the value of learning_rate in params 17 | ``` 18 | """ 19 | 20 | def __init__(self, json_path): 21 | with open(json_path) as f: 22 | params = json.load(f) 23 | self.__dict__.update(params) 24 | 25 | def save(self, json_path): 26 | with open(json_path, 'w') as f: 27 | json.dump(self.__dict__, f, indent=4) 28 | 29 | def update(self, json_path): 30 | """Loads parameters from json file""" 31 | with open(json_path) as f: 32 | params = json.load(f) 33 | self.__dict__.update(params) 34 | 35 | @property 36 | def dict(self): 37 | """Gives dict-like access to Params instance by `params.dict['learning_rate']""" 38 | return self.__dict__ 39 | 40 | 41 | class RunningAverage(): 42 | """A simple class that maintains the running average of a quantity 43 | 44 | Example: 45 | ``` 46 | loss_avg = RunningAverage() 47 | loss_avg.update(2) 48 | loss_avg.update(4) 49 | loss_avg() = 3 50 | ``` 51 | """ 52 | 53 | def __init__(self): 54 | self.steps = 0 55 | self.total = 0 56 | 57 | def update(self, val): 58 | self.total += val 59 | self.steps += 1 60 | 61 | def __call__(self): 62 | return self.total / float(self.steps) 63 | 64 | 65 | def set_logger(log_path): 66 | """Set the logger to log info in terminal and file `log_path`. 67 | 68 | In general, it is useful to have a logger so that every output to the terminal is saved 69 | in a permanent file. Here we save it to `model_dir/train.log`. 
70 | 71 | Example: 72 | ``` 73 | logging.info("Starting training...") 74 | ``` 75 | 76 | Args: 77 | log_path: (string) where to log 78 | """ 79 | logger = logging.getLogger() 80 | logger.setLevel(logging.INFO) 81 | 82 | if not logger.handlers: 83 | # Logging to a file 84 | file_handler = logging.FileHandler(log_path) 85 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 86 | logger.addHandler(file_handler) 87 | 88 | # Logging to console 89 | stream_handler = logging.StreamHandler() 90 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 91 | logger.addHandler(stream_handler) 92 | 93 | 94 | def save_dict_to_json(d, json_path): 95 | """Saves dict of floats in json file 96 | 97 | Args: 98 | d: (dict) of float-castable values (np.float, int, float, etc.) 99 | json_path: (string) path to json file 100 | """ 101 | with open(json_path, 'w') as f: 102 | # We need to convert the values to float for json (it doesn't accept np.array, np.float, ) 103 | d = {k: float(v) for k, v in d.items()} 104 | json.dump(d, f, indent=4) 105 | 106 | 107 | def save_checkpoint(state, is_best, checkpoint): 108 | """Saves model and training parameters at checkpoint + 'last.pth.tar'. If is_best==True, also saves 109 | checkpoint + 'best.pth.tar' 110 | 111 | Args: 112 | state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict 113 | is_best: (bool) True if it is the best model seen till now 114 | checkpoint: (string) folder where parameters are to be saved 115 | """ 116 | filepath = os.path.join(checkpoint, 'last.pth.tar') 117 | if not os.path.exists(checkpoint): 118 | print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint)) 119 | os.mkdir(checkpoint) 120 | else: 121 | print("Checkpoint Directory exists! ") 122 | torch.save(state, filepath) 123 | if is_best: 124 | shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar')) 125 | 126 | 127 | def load_checkpoint(checkpoint, model, optimizer=None): 128 | """Loads model parameters (state_dict) from file_path. If optimizer is provided, loads state_dict of 129 | optimizer assuming it is present in checkpoint. 130 | 131 | Args: 132 | checkpoint: (string) filename which needs to be loaded 133 | model: (torch.nn.Module) model for which the parameters are loaded 134 | optimizer: (torch.optim) optional: resume optimizer from checkpoint 135 | """ 136 | if not os.path.exists(checkpoint): 137 | raise ("File doesn't exist {}".format(checkpoint)) 138 | checkpoint = torch.load(checkpoint) 139 | model.load_state_dict(checkpoint['state_dict']) 140 | 141 | if optimizer: 142 | optimizer.load_state_dict(checkpoint['optim_dict']) 143 | 144 | return checkpoint -------------------------------------------------------------------------------- /pytorch/vision/README.md: -------------------------------------------------------------------------------- 1 | # Hand Signs Recognition with PyTorch 2 | 3 | *Authors: Surag Nair, Olivier Moindrot and Guillaume Genthial* 4 | 5 | Take the time to read the [tutorials](https://cs230-stanford.github.io/project-starter-code.html). 6 | 7 | Note: all scripts must be run in folder `pytorch/vision`. 8 | 9 | ## Requirements 10 | 11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html). 
12 |
13 | ```
14 | virtualenv -p python3 .env
15 | source .env/bin/activate
16 | pip install -r requirements.txt
17 | ```
18 |
19 | When you're done working on the project, deactivate the virtual environment with `deactivate`.
20 |
21 | ## Task
22 |
23 | Given an image of a hand doing a sign representing 0, 1, 2, 3, 4 or 5, predict the correct label.
24 |
25 |
26 | ## Download the SIGNS dataset
27 |
28 | For the vision example, we will use the SIGNS dataset created for this class. The dataset is hosted on Google Drive, download it [here][SIGNS].
29 |
30 | This will download the SIGNS dataset (~1.1 GB) containing photos of hand signs representing the numbers 0 to 5.
31 | Here is the structure of the data:
32 | ```
33 | SIGNS/
34 |     train_signs/
35 |         0_IMG_5864.jpg
36 |         ...
37 |     test_signs/
38 |         0_IMG_5942.jpg
39 |         ...
40 | ```
41 |
42 | The images are named following `{label}_IMG_{id}.jpg` where the label is in `[0, 5]`.
43 | The training set contains 1,080 images and the test set contains 120 images.
44 |
45 | Once the download is complete, move the dataset into `data/SIGNS`.
46 | Run the script `build_dataset.py` which will resize the images to size `(64, 64)`. The new resized dataset will be located by default in `data/64x64_SIGNS`:
47 |
48 | ```bash
49 | python build_dataset.py --data_dir data/SIGNS --output_dir data/64x64_SIGNS
50 | ```
51 |
52 |
53 |
54 | ## Quickstart (~10 min)
55 |
56 | 1. __Build the dataset of size 64x64__: make sure you complete this step before training
57 | ```bash
58 | python build_dataset.py --data_dir data/SIGNS --output_dir data/64x64_SIGNS
59 | ```
60 |
61 | 2. __Your first experiment__ We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the hyperparameters for the experiment. It looks like
62 | ```json
63 | {
64 |     "learning_rate": 1e-3,
65 |     "batch_size": 32,
66 |     "num_epochs": 10,
67 |     ...
68 | }
69 | ```
70 | For every new experiment, you will need to create a new directory under `experiments` with a similar `params.json` file.
71 |
72 | 3. __Train__ your experiment. Simply run
73 | ```
74 | python train.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model
75 | ```
76 | It will instantiate a model and train it on the training set following the hyperparameters specified in `params.json`. It will also evaluate some metrics on the validation set.
77 |
78 | 4. __Your first hyperparameters search__ We created a new directory `learning_rate` in `experiments` for you. Now, run
79 | ```
80 | python search_hyperparams.py --data_dir data/64x64_SIGNS --parent_dir experiments/learning_rate
81 | ```
82 | It will train and evaluate a model with different values of learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`.
83 |
84 | 5. __Display the results__ of the hyperparameters search in a nice format
85 | ```
86 | python synthesize_results.py --parent_dir experiments/learning_rate
87 | ```
88 |
89 | 6. __Evaluation on the test set__ Once you've run many experiments and selected your best model and hyperparameters based on the performance on the validation set, you can finally evaluate the performance of your model on the test set.
Run
90 | ```
91 | python evaluate.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model
92 | ```
93 |
94 |
95 | ## Guidelines for more advanced use
96 |
97 | We recommend reading through `train.py` to get a high-level overview of the training loop steps:
98 | - loading the hyperparameters for the experiment (the `params.json`)
99 | - loading the training and validation data
100 | - creating the model, loss_fn and metrics
101 | - training the model for a given number of epochs by calling `train_and_evaluate(...)`
102 |
103 | You can then have a look at `data_loader.py` to understand:
104 | - how jpg images are loaded and transformed to torch Tensors
105 | - how `fetch_dataloader(...)` uses `SIGNSDataset` to create batches of images and labels
106 |
107 | Once you get the high-level idea, depending on your dataset, you might want to modify
108 | - `model/net.py` to change the neural network, loss function and metrics
109 | - `model/data_loader.py` to suit the data loader to your specific needs
110 | - `train.py` for changing the optimizer
111 | - `train.py` and `evaluate.py`: some changes in the model or input may require changes here as well
112 |
113 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs.
114 |
115 | ## Resources
116 |
117 | - [PyTorch documentation](http://pytorch.org/docs/0.3.0/)
118 | - [Tutorials](http://pytorch.org/tutorials/)
119 | - [PyTorch warm-up](https://github.com/jcjohnson/pytorch-examples)
120 |
121 | [SIGNS]: https://drive.google.com/file/d/1ufiR6hUKhXoAyiBNsySPkUwlvE_wfEHC/view?usp=sharing
122 |
-------------------------------------------------------------------------------- /pytorch/vision/build_dataset.py: --------------------------------------------------------------------------------
1 | """Split the SIGNS dataset into train/val/test and resize images to 64x64.
2 |
3 | The SIGNS dataset comes in the following format:
4 |     train_signs/
5 |         0_IMG_5864.jpg
6 |         ...
7 |     test_signs/
8 |         0_IMG_5942.jpg
9 |         ...
10 |
11 | Original images have size (3024, 3024).
12 | Resizing to (64, 64) reduces the dataset size from 1.16 GB to 4.7 MB, and loading smaller images
13 | makes training faster.
14 |
15 | We already have a test set created, so we only need to split "train_signs" into train and val sets.
16 | Because we don't have a lot of images and we want the statistics on the val set to be as
17 | representative as possible, we'll take 20% of "train_signs" as val set.
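The resulting dataset (by default under data/64x64_SIGNS) mirrors this layout, with train_signs/, val_signs/ and test_signs/ folders of images resized to (64, 64).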
18 | """ 19 | 20 | import argparse 21 | import random 22 | import os 23 | 24 | from PIL import Image 25 | from tqdm import tqdm 26 | 27 | SIZE = 64 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('--data_dir', default='data/SIGNS', help="Directory with the SIGNS dataset") 31 | parser.add_argument('--output_dir', default='data/64x64_SIGNS', help="Where to write the new data") 32 | 33 | 34 | def resize_and_save(filename, output_dir, size=SIZE): 35 | """Resize the image contained in `filename` and save it to the `output_dir`""" 36 | image = Image.open(filename) 37 | # Use bilinear interpolation instead of the default "nearest neighbor" method 38 | image = image.resize((size, size), Image.BILINEAR) 39 | image.save(os.path.join(output_dir, filename.split('/')[-1])) 40 | 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | 45 | assert os.path.isdir(args.data_dir), "Couldn't find the dataset at {}".format(args.data_dir) 46 | 47 | # Define the data directories 48 | train_data_dir = os.path.join(args.data_dir, 'train_signs') 49 | test_data_dir = os.path.join(args.data_dir, 'test_signs') 50 | 51 | # Get the filenames in each directory (train and test) 52 | filenames = os.listdir(train_data_dir) 53 | filenames = [os.path.join(train_data_dir, f) for f in filenames if f.endswith('.jpg')] 54 | 55 | test_filenames = os.listdir(test_data_dir) 56 | test_filenames = [os.path.join(test_data_dir, f) for f in test_filenames if f.endswith('.jpg')] 57 | 58 | # Split the images in 'train_signs' into 80% train and 20% val 59 | # Make sure to always shuffle with a fixed seed so that the split is reproducible 60 | random.seed(230) 61 | filenames.sort() 62 | random.shuffle(filenames) 63 | 64 | split = int(0.8 * len(filenames)) 65 | train_filenames = filenames[:split] 66 | val_filenames = filenames[split:] 67 | 68 | filenames = {'train': train_filenames, 69 | 'val': val_filenames, 70 | 'test': test_filenames} 71 | 72 | if not os.path.exists(args.output_dir): 73 | os.mkdir(args.output_dir) 74 | else: 75 | print("Warning: output dir {} already exists".format(args.output_dir)) 76 | 77 | # Preprocess train, val and test 78 | for split in ['train', 'val', 'test']: 79 | output_dir_split = os.path.join(args.output_dir, '{}_signs'.format(split)) 80 | if not os.path.exists(output_dir_split): 81 | os.mkdir(output_dir_split) 82 | else: 83 | print("Warning: dir {} already exists".format(output_dir_split)) 84 | 85 | print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split)) 86 | for filename in tqdm(filenames[split]): 87 | resize_and_save(filename, output_dir_split, size=SIZE) 88 | 89 | print("Done building dataset") 90 | -------------------------------------------------------------------------------- /pytorch/vision/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/vision/data/.gitkeep -------------------------------------------------------------------------------- /pytorch/vision/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluates the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | from torch.autograd import Variable 10 | import utils 11 | import model.net as net 12 | import model.data_loader as data_loader 13 | 14 | parser = argparse.ArgumentParser() 15 | 
parser.add_argument('--data_dir', default='data/64x64_SIGNS', 16 | help="Directory containing the dataset") 17 | parser.add_argument('--model_dir', default='experiments/base_model', 18 | help="Directory containing params.json") 19 | parser.add_argument('--restore_file', default='best', help="name of the file in --model_dir \ 20 | containing weights to load") 21 | 22 | 23 | def evaluate(model, loss_fn, dataloader, metrics, params): 24 | """Evaluate the model on `num_steps` batches. 25 | 26 | Args: 27 | model: (torch.nn.Module) the neural network 28 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 29 | dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches data 30 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 31 | params: (Params) hyperparameters 32 | num_steps: (int) number of batches to train on, each of size params.batch_size 33 | """ 34 | 35 | # set model to evaluation mode 36 | model.eval() 37 | 38 | # summary for current eval loop 39 | summ = [] 40 | 41 | # compute metrics over the dataset 42 | for data_batch, labels_batch in dataloader: 43 | 44 | # move to GPU if available 45 | if params.cuda: 46 | data_batch, labels_batch = data_batch.cuda( 47 | non_blocking=True), labels_batch.cuda(non_blocking=True) 48 | # fetch the next evaluation batch 49 | data_batch, labels_batch = Variable(data_batch), Variable(labels_batch) 50 | 51 | # compute model output 52 | output_batch = model(data_batch) 53 | loss = loss_fn(output_batch, labels_batch) 54 | 55 | # extract data from torch Variable, move to cpu, convert to numpy arrays 56 | output_batch = output_batch.data.cpu().numpy() 57 | labels_batch = labels_batch.data.cpu().numpy() 58 | 59 | # compute all metrics on this batch 60 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 61 | for metric in metrics} 62 | summary_batch['loss'] = loss.item() 63 | summ.append(summary_batch) 64 | 65 | # compute mean of all metrics in summary 66 | metrics_mean = {metric: np.mean([x[metric] 67 | for x in summ]) for metric in summ[0]} 68 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) 69 | for k, v in metrics_mean.items()) 70 | logging.info("- Eval metrics : " + metrics_string) 71 | return metrics_mean 72 | 73 | 74 | if __name__ == '__main__': 75 | """ 76 | Evaluate the model on the test set. 
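Example: `python evaluate.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model --restore_file best` (these values are also the defaults of the arguments defined above).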
77 | """ 78 | # Load the parameters 79 | args = parser.parse_args() 80 | json_path = os.path.join(args.model_dir, 'params.json') 81 | assert os.path.isfile( 82 | json_path), "No json configuration file found at {}".format(json_path) 83 | params = utils.Params(json_path) 84 | 85 | # use GPU if available 86 | params.cuda = torch.cuda.is_available() # use GPU is available 87 | 88 | # Set the random seed for reproducible experiments 89 | torch.manual_seed(230) 90 | if params.cuda: 91 | torch.cuda.manual_seed(230) 92 | 93 | # Get the logger 94 | utils.set_logger(os.path.join(args.model_dir, 'evaluate.log')) 95 | 96 | # Create the input data pipeline 97 | logging.info("Creating the dataset...") 98 | 99 | # fetch dataloaders 100 | dataloaders = data_loader.fetch_dataloader(['test'], args.data_dir, params) 101 | test_dl = dataloaders['test'] 102 | 103 | logging.info("- done.") 104 | 105 | # Define the model 106 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 107 | 108 | loss_fn = net.loss_fn 109 | metrics = net.metrics 110 | 111 | logging.info("Starting evaluation") 112 | 113 | # Reload weights from the saved file 114 | utils.load_checkpoint(os.path.join( 115 | args.model_dir, args.restore_file + '.pth.tar'), model) 116 | 117 | # Evaluate 118 | test_metrics = evaluate(model, loss_fn, test_dl, metrics, params) 119 | save_path = os.path.join( 120 | args.model_dir, "metrics_test_{}.json".format(args.restore_file)) 121 | utils.save_dict_to_json(test_metrics, save_path) 122 | -------------------------------------------------------------------------------- /pytorch/vision/experiments/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/vision/experiments/.gitkeep -------------------------------------------------------------------------------- /pytorch/vision/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | "dropout_rate":0.8, 6 | "num_channels": 32, 7 | "save_summary_steps": 100, 8 | "num_workers": 4 9 | } 10 | -------------------------------------------------------------------------------- /pytorch/vision/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | "dropout_rate":0.8, 6 | "num_channels": 32, 7 | "save_summary_steps": 100, 8 | "num_workers": 4 9 | } 10 | -------------------------------------------------------------------------------- /pytorch/vision/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/pytorch/vision/model/__init__.py -------------------------------------------------------------------------------- /pytorch/vision/model/data_loader.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | 4 | from PIL import Image 5 | from torch.utils.data import Dataset, DataLoader 6 | import torchvision.transforms as transforms 7 | 8 | # borrowed from http://pytorch.org/tutorials/advanced/neural_style_tutorial.html 9 | # and http://pytorch.org/tutorials/beginner/data_loading_tutorial.html 10 | # define a training 
image loader that specifies transforms on images. See documentation for more details. 11 | train_transformer = transforms.Compose([ 12 | transforms.Resize(64), # resize the image to 64x64 (remove if images are already 64x64) 13 | transforms.RandomHorizontalFlip(), # randomly flip image horizontally 14 | transforms.ToTensor()]) # transform it into a torch tensor 15 | 16 | # loader for evaluation, no horizontal flip 17 | eval_transformer = transforms.Compose([ 18 | transforms.Resize(64), # resize the image to 64x64 (remove if images are already 64x64) 19 | transforms.ToTensor()]) # transform it into a torch tensor 20 | 21 | 22 | class SIGNSDataset(Dataset): 23 | """ 24 | A standard PyTorch definition of Dataset which defines the functions __len__ and __getitem__. 25 | """ 26 | def __init__(self, data_dir, transform): 27 | """ 28 | Store the filenames of the jpgs to use. Specifies transforms to apply on images. 29 | 30 | Args: 31 | data_dir: (string) directory containing the dataset 32 | transform: (torchvision.transforms) transformation to apply on image 33 | """ 34 | self.filenames = os.listdir(data_dir) 35 | self.filenames = [os.path.join(data_dir, f) for f in self.filenames if f.endswith('.jpg')] 36 | 37 | self.labels = [int(os.path.split(filename)[-1][0]) for filename in self.filenames] 38 | self.transform = transform 39 | 40 | def __len__(self): 41 | # return size of dataset 42 | return len(self.filenames) 43 | 44 | def __getitem__(self, idx): 45 | """ 46 | Fetch index idx image and labels from dataset. Perform transforms on image. 47 | 48 | Args: 49 | idx: (int) index in [0, 1, ..., size_of_dataset-1] 50 | 51 | Returns: 52 | image: (Tensor) transformed image 53 | label: (int) corresponding label of image 54 | """ 55 | image = Image.open(self.filenames[idx]) # PIL image 56 | image = self.transform(image) 57 | return image, self.labels[idx] 58 | 59 | 60 | def fetch_dataloader(types, data_dir, params): 61 | """ 62 | Fetches the DataLoader object for each type in types from data_dir. 63 | 64 | Args: 65 | types: (list) has one or more of 'train', 'val', 'test' depending on which data is required 66 | data_dir: (string) directory containing the dataset 67 | params: (Params) hyperparameters 68 | 69 | Returns: 70 | data: (dict) contains the DataLoader object for each type in types 71 | """ 72 | dataloaders = {} 73 | 74 | for split in ['train', 'val', 'test']: 75 | if split in types: 76 | path = os.path.join(data_dir, "{}_signs".format(split)) 77 | 78 | # use the train_transformer if training data, else use eval_transformer without random flip 79 | if split == 'train': 80 | dl = DataLoader(SIGNSDataset(path, train_transformer), batch_size=params.batch_size, shuffle=True, 81 | num_workers=params.num_workers, 82 | pin_memory=params.cuda) 83 | else: 84 | dl = DataLoader(SIGNSDataset(path, eval_transformer), batch_size=params.batch_size, shuffle=False, 85 | num_workers=params.num_workers, 86 | pin_memory=params.cuda) 87 | 88 | dataloaders[split] = dl 89 | 90 | return dataloaders 91 | -------------------------------------------------------------------------------- /pytorch/vision/model/net.py: -------------------------------------------------------------------------------- 1 | """Defines the neural network, losss function and metrics""" 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class Net(nn.Module): 10 | """ 11 | This is the standard way to define your own network in PyTorch. You typically choose the components 12 | (e.g. 
LSTMs, linear layers etc.) of your network in the __init__ function. You then apply these layers
13 | on the input step-by-step in the forward function. You can use torch.nn.functional to apply functions
14 |
15 | such as F.relu, F.sigmoid, F.softmax, F.max_pool2d. Be careful to ensure your dimensions are correct after each
16 | step. You are encouraged to have a look at the network in pytorch/nlp/model/net.py to get a better sense of how
17 | you can go about defining your own network.
18 |
19 | The documentation for all the various components available to you is here: http://pytorch.org/docs/master/nn.html
20 | """
21 |
22 | def __init__(self, params):
23 | """
24 | We define a convolutional network that predicts the sign from an image. The components
25 | required are:
26 |
27 | - conv1, conv2, conv3: convolutional layers that extract feature maps from the image
28 | - bn1, bn2, bn3: batch normalisation layers applied after each convolution to stabilise training
29 | - fc1, fc2: fully connected layers that map the flattened features to a distribution over the 6 sign labels
30 |
31 | Args:
32 | params: (Params) contains num_channels
33 | """
34 | super(Net, self).__init__()
35 | self.num_channels = params.num_channels
36 |
37 | # each of the convolution layers below has the arguments (input_channels, output_channels, filter_size,
38 | # stride, padding). We also include batch normalisation layers that help stabilise training.
39 | # For more details on how to use these layers, check out the documentation.
40 | self.conv1 = nn.Conv2d(3, self.num_channels, 3, stride=1, padding=1)
41 | self.bn1 = nn.BatchNorm2d(self.num_channels)
42 | self.conv2 = nn.Conv2d(self.num_channels, self.num_channels*2, 3, stride=1, padding=1)
43 | self.bn2 = nn.BatchNorm2d(self.num_channels*2)
44 | self.conv3 = nn.Conv2d(self.num_channels*2, self.num_channels*4, 3, stride=1, padding=1)
45 | self.bn3 = nn.BatchNorm2d(self.num_channels*4)
46 |
47 | # 2 fully connected layers to transform the output of the convolution layers to the final output
48 | self.fc1 = nn.Linear(8*8*self.num_channels*4, self.num_channels*4)
49 | self.fcbn1 = nn.BatchNorm1d(self.num_channels*4)
50 | self.fc2 = nn.Linear(self.num_channels*4, 6)
51 | self.dropout_rate = params.dropout_rate
52 |
53 | def forward(self, s):
54 | """
55 | This function defines how we use the components of our network to operate on an input batch.
56 |
57 | Args:
58 | s: (Variable) contains a batch of images, of dimension batch_size x 3 x 64 x 64.
59 |
60 | Returns:
61 | out: (Variable) dimension batch_size x 6 with the log probabilities for the labels of each image.
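For reference: with 64 x 64 inputs, the three stride-2 max-pools below halve the spatial size three times (64 -> 32 -> 16 -> 8), which is why self.fc1 above expects 8*8*self.num_channels*4 features once the output of conv3 is flattened.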
62 | 63 | Note: the dimensions after each step are provided 64 | """ 65 | # -> batch_size x 3 x 64 x 64 66 | # we apply the convolution layers, followed by batch normalisation, maxpool and relu x 3 67 | s = self.bn1(self.conv1(s)) # batch_size x num_channels x 64 x 64 68 | s = F.relu(F.max_pool2d(s, 2)) # batch_size x num_channels x 32 x 32 69 | s = self.bn2(self.conv2(s)) # batch_size x num_channels*2 x 32 x 32 70 | s = F.relu(F.max_pool2d(s, 2)) # batch_size x num_channels*2 x 16 x 16 71 | s = self.bn3(self.conv3(s)) # batch_size x num_channels*4 x 16 x 16 72 | s = F.relu(F.max_pool2d(s, 2)) # batch_size x num_channels*4 x 8 x 8 73 | 74 | # flatten the output for each image 75 | s = s.view(-1, 8*8*self.num_channels*4) # batch_size x 8*8*num_channels*4 76 | 77 | # apply 2 fully connected layers with dropout 78 | s = F.dropout(F.relu(self.fcbn1(self.fc1(s))), 79 | p=self.dropout_rate, training=self.training) # batch_size x self.num_channels*4 80 | s = self.fc2(s) # batch_size x 6 81 | 82 | # apply log softmax on each image's output (this is recommended over applying softmax 83 | # since it is numerically more stable) 84 | return F.log_softmax(s, dim=1) 85 | 86 | 87 | def loss_fn(outputs, labels): 88 | """ 89 | Compute the cross entropy loss given outputs and labels. 90 | 91 | Args: 92 | outputs: (Variable) dimension batch_size x 6 - output of the model 93 | labels: (Variable) dimension batch_size, where each element is a value in [0, 1, 2, 3, 4, 5] 94 | 95 | Returns: 96 | loss (Variable): cross entropy loss for all images in the batch 97 | 98 | Note: you may use a standard loss function from http://pytorch.org/docs/master/nn.html#loss-functions. This example 99 | demonstrates how you can easily define a custom loss function. 100 | """ 101 | num_examples = outputs.size()[0] 102 | return -torch.sum(outputs[range(num_examples), labels])/num_examples 103 | 104 | 105 | def accuracy(outputs, labels): 106 | """ 107 | Compute the accuracy, given the outputs and labels for all images. 
108 |
109 | Args:
110 | outputs: (np.ndarray) dimension batch_size x 6 - log softmax output of the model
111 | labels: (np.ndarray) dimension batch_size, where each element is a value in [0, 1, 2, 3, 4, 5]
112 |
113 | Returns: (float) accuracy in [0,1]
114 | """
115 | outputs = np.argmax(outputs, axis=1)
116 | return np.sum(outputs==labels)/float(labels.size)
117 |
118 |
119 | # maintain all metrics required in this dictionary- these are used in the training and evaluation loops
120 | metrics = {
121 | 'accuracy': accuracy,
122 | # could add more metrics such as accuracy for each token type
123 | }
124 |
-------------------------------------------------------------------------------- /pytorch/vision/requirements.txt: --------------------------------------------------------------------------------
1 | numpy
2 | Pillow
3 | torch>=1.2
4 | tabulate
5 | tqdm
6 | torchvision
7 |
-------------------------------------------------------------------------------- /pytorch/vision/search_hyperparams.py: --------------------------------------------------------------------------------
1 | """Perform hyperparameters search"""
2 |
3 | import argparse
4 | import os
5 | from subprocess import check_call
6 | import sys
7 |
8 | import utils
9 |
10 |
11 | PYTHON = sys.executable
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--parent_dir', default='experiments/learning_rate',
14 | help='Directory containing params.json')
15 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', help="Directory containing the dataset")
16 |
17 |
18 | def launch_training_job(parent_dir, data_dir, job_name, params):
19 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name
20 |
21 | Args:
22 | parent_dir: (string) parent directory in which the new job folder `job_name` is created
23 | data_dir: (string) directory containing the dataset
24 | params: (Params) hyperparameters to write to the new job's params.json
25 | """
26 | # Create a new folder in parent_dir with unique_name "job_name"
27 | model_dir = os.path.join(parent_dir, job_name)
28 | if not os.path.exists(model_dir):
29 | os.makedirs(model_dir)
30 |
31 | # Write parameters in json file
32 | json_path = os.path.join(model_dir, 'params.json')
33 | params.save(json_path)
34 |
35 | # Launch training with this config
36 | cmd = "{python} train.py --model_dir={model_dir} --data_dir {data_dir}".format(python=PYTHON, model_dir=model_dir,
37 | data_dir=data_dir)
38 | print(cmd)
39 | check_call(cmd, shell=True)
40 |
41 |
42 | if __name__ == "__main__":
43 | # Load the "reference" parameters from parent_dir json file
44 | args = parser.parse_args()
45 | json_path = os.path.join(args.parent_dir, 'params.json')
46 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
47 | params = utils.Params(json_path)
48 |
49 | # Perform hyperparameter search over one parameter
50 | learning_rates = [1e-4, 1e-3, 1e-2]
51 |
52 | for learning_rate in learning_rates:
53 | # Modify the relevant parameter in params
54 | params.learning_rate = learning_rate
55 |
56 | # Launch job (name has to be unique)
57 | job_name = "learning_rate_{}".format(learning_rate)
58 | launch_training_job(args.parent_dir, args.data_dir, job_name, params)
59 |
-------------------------------------------------------------------------------- /pytorch/vision/synthesize_results.py: --------------------------------------------------------------------------------
1 | """Aggregates results from the metrics_val_best_weights.json in a parent folder"""
2 |
3 | import argparse
4 | import json
5 |
import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in folder `parent_dir`. 17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_dev.json` 20 | 21 | Args: 22 | parent_dir: (string) path to directory containing experiments results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_val_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) -------------------------------------------------------------------------------- /pytorch/vision/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import torch 9 | import torch.optim as optim 10 | from torch.autograd import Variable 11 | from tqdm import tqdm 12 | 13 | import utils 14 | import model.net as net 15 | import model.data_loader as data_loader 16 | from evaluate import evaluate 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 20 | help="Directory containing the dataset") 21 | parser.add_argument('--model_dir', default='experiments/base_model', 22 | help="Directory containing params.json") 23 | parser.add_argument('--restore_file', default=None, 24 | help="Optional, name of the file in --model_dir containing weights to reload before \ 25 | training") # 'best' or 'train' 26 | 27 | 28 | def train(model, optimizer, loss_fn, dataloader, metrics, params): 29 | """Train the model on `num_steps` batches 30 | 31 | Args: 32 | model: (torch.nn.Module) the neural network 33 | optimizer: (torch.optim) optimizer for parameters of model 34 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 35 | dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data 36 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each 
batch 37 | params: (Params) hyperparameters 38 | num_steps: (int) number of batches to train on, each of size params.batch_size 39 | """ 40 | 41 | # set model to training mode 42 | model.train() 43 | 44 | # summary for current training loop and a running average object for loss 45 | summ = [] 46 | loss_avg = utils.RunningAverage() 47 | 48 | # Use tqdm for progress bar 49 | with tqdm(total=len(dataloader)) as t: 50 | for i, (train_batch, labels_batch) in enumerate(dataloader): 51 | # move to GPU if available 52 | if params.cuda: 53 | train_batch, labels_batch = train_batch.cuda( 54 | non_blocking=True), labels_batch.cuda(non_blocking=True) 55 | # convert to torch Variables 56 | train_batch, labels_batch = Variable( 57 | train_batch), Variable(labels_batch) 58 | 59 | # compute model output and loss 60 | output_batch = model(train_batch) 61 | loss = loss_fn(output_batch, labels_batch) 62 | 63 | # clear previous gradients, compute gradients of all variables wrt loss 64 | optimizer.zero_grad() 65 | loss.backward() 66 | 67 | # performs updates using calculated gradients 68 | optimizer.step() 69 | 70 | # Evaluate summaries only once in a while 71 | if i % params.save_summary_steps == 0: 72 | # extract data from torch Variable, move to cpu, convert to numpy arrays 73 | output_batch = output_batch.data.cpu().numpy() 74 | labels_batch = labels_batch.data.cpu().numpy() 75 | 76 | # compute all metrics on this batch 77 | summary_batch = {metric: metrics[metric](output_batch, labels_batch) 78 | for metric in metrics} 79 | summary_batch['loss'] = loss.item() 80 | summ.append(summary_batch) 81 | 82 | # update the average loss 83 | loss_avg.update(loss.item()) 84 | 85 | t.set_postfix(loss='{:05.3f}'.format(loss_avg())) 86 | t.update() 87 | 88 | # compute mean of all metrics in summary 89 | metrics_mean = {metric: np.mean([x[metric] 90 | for x in summ]) for metric in summ[0]} 91 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) 92 | for k, v in metrics_mean.items()) 93 | logging.info("- Train metrics: " + metrics_string) 94 | 95 | 96 | def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, loss_fn, metrics, params, model_dir, 97 | restore_file=None): 98 | """Train the model and evaluate every epoch. 
99 | 100 | Args: 101 | model: (torch.nn.Module) the neural network 102 | train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data 103 | val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data 104 | optimizer: (torch.optim) optimizer for parameters of model 105 | loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch 106 | metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch 107 | params: (Params) hyperparameters 108 | model_dir: (string) directory containing config, weights and log 109 | restore_file: (string) optional- name of file to restore from (without its extension .pth.tar) 110 | """ 111 | # reload weights from restore_file if specified 112 | if restore_file is not None: 113 | restore_path = os.path.join( 114 | args.model_dir, args.restore_file + '.pth.tar') 115 | logging.info("Restoring parameters from {}".format(restore_path)) 116 | utils.load_checkpoint(restore_path, model, optimizer) 117 | 118 | best_val_acc = 0.0 119 | 120 | for epoch in range(params.num_epochs): 121 | # Run one epoch 122 | logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs)) 123 | 124 | # compute number of batches in one epoch (one full pass over the training set) 125 | train(model, optimizer, loss_fn, train_dataloader, metrics, params) 126 | 127 | # Evaluate for one epoch on validation set 128 | val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params) 129 | 130 | val_acc = val_metrics['accuracy'] 131 | is_best = val_acc >= best_val_acc 132 | 133 | # Save weights 134 | utils.save_checkpoint({'epoch': epoch + 1, 135 | 'state_dict': model.state_dict(), 136 | 'optim_dict': optimizer.state_dict()}, 137 | is_best=is_best, 138 | checkpoint=model_dir) 139 | 140 | # If best_eval, best_save_path 141 | if is_best: 142 | logging.info("- Found new best accuracy") 143 | best_val_acc = val_acc 144 | 145 | # Save best val metrics in a json file in the model directory 146 | best_json_path = os.path.join( 147 | model_dir, "metrics_val_best_weights.json") 148 | utils.save_dict_to_json(val_metrics, best_json_path) 149 | 150 | # Save latest val metrics in a json file in the model directory 151 | last_json_path = os.path.join( 152 | model_dir, "metrics_val_last_weights.json") 153 | utils.save_dict_to_json(val_metrics, last_json_path) 154 | 155 | 156 | if __name__ == '__main__': 157 | 158 | # Load the parameters from json file 159 | args = parser.parse_args() 160 | json_path = os.path.join(args.model_dir, 'params.json') 161 | assert os.path.isfile( 162 | json_path), "No json configuration file found at {}".format(json_path) 163 | params = utils.Params(json_path) 164 | 165 | # use GPU if available 166 | params.cuda = torch.cuda.is_available() 167 | 168 | # Set the random seed for reproducible experiments 169 | torch.manual_seed(230) 170 | if params.cuda: 171 | torch.cuda.manual_seed(230) 172 | 173 | # Set the logger 174 | utils.set_logger(os.path.join(args.model_dir, 'train.log')) 175 | 176 | # Create the input data pipeline 177 | logging.info("Loading the datasets...") 178 | 179 | # fetch dataloaders 180 | dataloaders = data_loader.fetch_dataloader( 181 | ['train', 'val'], args.data_dir, params) 182 | train_dl = dataloaders['train'] 183 | val_dl = dataloaders['val'] 184 | 185 | logging.info("- done.") 186 | 187 | # Define the model and optimizer 188 | model = net.Net(params).cuda() if params.cuda else net.Net(params) 189 | optimizer = 
optim.Adam(model.parameters(), lr=params.learning_rate) 190 | 191 | # fetch loss function and metrics 192 | loss_fn = net.loss_fn 193 | metrics = net.metrics 194 | 195 | # Train the model 196 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 197 | train_and_evaluate(model, train_dl, val_dl, optimizer, loss_fn, metrics, params, args.model_dir, 198 | args.restore_file) 199 | -------------------------------------------------------------------------------- /pytorch/vision/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import shutil 5 | 6 | import torch 7 | 8 | class Params(): 9 | """Class that loads hyperparameters from a json file. 10 | 11 | Example: 12 | ``` 13 | params = Params(json_path) 14 | print(params.learning_rate) 15 | params.learning_rate = 0.5 # change the value of learning_rate in params 16 | ``` 17 | """ 18 | 19 | def __init__(self, json_path): 20 | with open(json_path) as f: 21 | params = json.load(f) 22 | self.__dict__.update(params) 23 | 24 | def save(self, json_path): 25 | with open(json_path, 'w') as f: 26 | json.dump(self.__dict__, f, indent=4) 27 | 28 | def update(self, json_path): 29 | """Loads parameters from json file""" 30 | with open(json_path) as f: 31 | params = json.load(f) 32 | self.__dict__.update(params) 33 | 34 | @property 35 | def dict(self): 36 | """Gives dict-like access to Params instance by `params.dict['learning_rate']""" 37 | return self.__dict__ 38 | 39 | 40 | class RunningAverage(): 41 | """A simple class that maintains the running average of a quantity 42 | 43 | Example: 44 | ``` 45 | loss_avg = RunningAverage() 46 | loss_avg.update(2) 47 | loss_avg.update(4) 48 | loss_avg() = 3 49 | ``` 50 | """ 51 | def __init__(self): 52 | self.steps = 0 53 | self.total = 0 54 | 55 | def update(self, val): 56 | self.total += val 57 | self.steps += 1 58 | 59 | def __call__(self): 60 | return self.total/float(self.steps) 61 | 62 | 63 | def set_logger(log_path): 64 | """Set the logger to log info in terminal and file `log_path`. 65 | 66 | In general, it is useful to have a logger so that every output to the terminal is saved 67 | in a permanent file. Here we save it to `model_dir/train.log`. 68 | 69 | Example: 70 | ``` 71 | logging.info("Starting training...") 72 | ``` 73 | 74 | Args: 75 | log_path: (string) where to log 76 | """ 77 | logger = logging.getLogger() 78 | logger.setLevel(logging.INFO) 79 | 80 | if not logger.handlers: 81 | # Logging to a file 82 | file_handler = logging.FileHandler(log_path) 83 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 84 | logger.addHandler(file_handler) 85 | 86 | # Logging to console 87 | stream_handler = logging.StreamHandler() 88 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 89 | logger.addHandler(stream_handler) 90 | 91 | 92 | def save_dict_to_json(d, json_path): 93 | """Saves dict of floats in json file 94 | 95 | Args: 96 | d: (dict) of float-castable values (np.float, int, float, etc.) 97 | json_path: (string) path to json file 98 | """ 99 | with open(json_path, 'w') as f: 100 | # We need to convert the values to float for json (it doesn't accept np.array, np.float, ) 101 | d = {k: float(v) for k, v in d.items()} 102 | json.dump(d, f, indent=4) 103 | 104 | 105 | def save_checkpoint(state, is_best, checkpoint): 106 | """Saves model and training parameters at checkpoint + 'last.pth.tar'. 
If is_best==True, also saves
107 | checkpoint + 'best.pth.tar'
108 |
109 | Args:
110 | state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
111 | is_best: (bool) True if it is the best model seen till now
112 | checkpoint: (string) folder where parameters are to be saved
113 | """
114 | filepath = os.path.join(checkpoint, 'last.pth.tar')
115 | if not os.path.exists(checkpoint):
116 | print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint))
117 | os.mkdir(checkpoint)
118 | else:
119 | print("Checkpoint Directory exists! ")
120 | torch.save(state, filepath)
121 | if is_best:
122 | shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))
123 |
124 |
125 | def load_checkpoint(checkpoint, model, optimizer=None):
126 | """Loads model parameters (state_dict) from the file `checkpoint`. If optimizer is provided, loads state_dict of
127 | optimizer assuming it is present in checkpoint.
128 |
129 | Args:
130 | checkpoint: (string) filename which needs to be loaded
131 | model: (torch.nn.Module) model for which the parameters are loaded
132 | optimizer: (torch.optim) optional: resume optimizer from checkpoint
133 | """
134 | if not os.path.exists(checkpoint):
135 | raise FileNotFoundError("File doesn't exist {}".format(checkpoint))
136 | checkpoint = torch.load(checkpoint)
137 | model.load_state_dict(checkpoint['state_dict'])
138 |
139 | if optimizer:
140 | optimizer.load_state_dict(checkpoint['optim_dict'])
141 |
142 | return checkpoint
-------------------------------------------------------------------------------- /tensorflow/nlp/README.md: --------------------------------------------------------------------------------
1 | # Named Entity Recognition with Tensorflow
2 |
3 | _Authors: Guillaume Genthial and Olivier Moindrot_
4 |
5 | Take the time to read the [tutorials](https://cs230-stanford.github.io).
6 |
7 | Note: all scripts must be run in `tensorflow/nlp`.
8 |
9 | ## Requirements
10 |
11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html).
12 |
13 | ```
14 | virtualenv -p python3 .env
15 | source .env/bin/activate
16 | pip install -r requirements.txt
17 | ```
18 |
19 | When you're done working on the project, deactivate the virtual environment with `deactivate`.
20 |
21 | ## Task
22 |
23 | Given a sentence, give a tag to each word ([Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition))
24 |
25 | ```
26 | John lives in New York
27 | B-PER O    O  B-LOC I-LOC
28 | ```
29 |
30 | ## [optional] Download the Kaggle dataset (~5 min)
31 |
32 | We provide a small subset of the Kaggle dataset (30 sentences) for testing in `data/small` but you are encouraged to download the original version on the [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) website.
33 |
34 | 1. **Download the dataset** `ner_dataset.csv` on [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data) and save it under the `nlp/data/kaggle` directory. Make sure you download the simple version `ner_dataset.csv` and NOT the full version `ner.csv`.
35 |
36 | 2. **Build the dataset** Run the following script
37 |
38 | ```
39 | python build_kaggle_dataset.py
40 | ```
41 |
42 | It will extract the sentences and labels from the dataset, split it into train / test / dev and save it in a convenient format for our model.
43 |
44 | _Debug_ If you get some errors, check that you downloaded the right file and saved it in the right directory.
If you have issues with encoding, try running the script with Python 2.7.
45 |
46 | 3. In the next section, replace `data/small` with `data/kaggle`
47 |
48 | ## Quickstart (~10 min)
49 |
50 | 1. **Build** vocabularies and parameters for your dataset by running
51 |
52 | ```
53 | python build_vocab.py --data_dir data/small
54 | ```
55 |
56 | It will write vocabulary files `words.txt` and `tags.txt` containing the words and tags in the dataset. It will also save a `dataset_params.json` with some extra information.
57 |
58 | 2. **Your first experiment** We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the parameters for the experiment. It looks like
59 |
60 | ```json
61 | {
62 |   "learning_rate": 1e-3,
63 |   "batch_size": 5,
64 |   "num_epochs": 2
65 | }
66 | ```
67 |
68 | For every new experiment, you will need to create a new directory under `experiments` with a `params.json` file.
69 |
70 | 3. **Train** your experiment. Simply run
71 |
72 | ```
73 | python train.py --data_dir data/small --model_dir experiments/base_model
74 | ```
75 |
76 | It will instantiate a model and train it on the training set following the parameters specified in `params.json`. It will also evaluate some metrics on the development set.
77 |
78 | 4. **Your first hyperparameters search** We created a new directory `learning_rate` in `experiments` for you. Now, run
79 |
80 | ```
81 | python search_hyperparams.py --data_dir data/small --parent_dir experiments/learning_rate
82 | ```
83 |
84 | It will train and evaluate a model with different values of learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`.
85 |
86 | 5. **Display the results** of the hyperparameters search in a nice format
87 |
88 | ```
89 | python synthesize_results.py --parent_dir experiments/learning_rate
90 | ```
91 |
92 | 6. **Evaluation on the test set** Once you've run many experiments and selected your best model and hyperparameters based on the performance on the development set, you can finally evaluate the performance of your model on the test set. Run
93 |
94 | ```
95 | python evaluate.py --data_dir data/small --model_dir experiments/base_model
96 | ```
97 |
98 | ## Guidelines for more advanced use
99 |
100 | We recommend reading through `train.py` to get a high-level overview of the steps:
101 |
102 | - loading the parameters for the experiment (the `params.json`) and for the dataset (the `dataset_params.json`)
103 | - loading the vocabularies from the `words.txt` and `tags.txt` files.
104 | - creating the sentences / labels datasets (`tf.data.TextLineDataset` instances reading the files and replacing tokens by their ids)
105 | - creating the input of our model by zipping the sentences and labels together (`input_fn(...)`), as well as performing batching and padding (for sentences of different length).
106 | - creating the model (=nodes / ops of the `tf.Graph()`) by calling `model_fn(...)`
107 | - training the model for a given number of epochs by calling `train_and_evaluate(...)`
108 |
109 | Once you get the high-level idea, depending on your dataset, you might want to modify
110 |
111 | - `model/model_fn.py` to change the model (a sketch of this kind of change follows this list)
112 | - `model/input_fn.py` to change the way you read data / change the way you combine your different files
113 | - `train.py` and `evaluate.py` to change the story-line (maybe you need another vocabulary, etc.)
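For instance, adding an extra evaluation metric usually only touches the `metrics` dictionary built in `model/model_fn.py`. The snippet below is a rough sketch of that kind of change under TF 1.x; `labels`, `predictions` and `loss` stand in for the tensors your `model_fn` already defines, and the placeholders are only there to keep the example self-contained:

```python
import tensorflow as tf  # TF 1.x

# Stand-ins for the tensors that model_fn already builds
labels = tf.placeholder(tf.int64, shape=[None])
predictions = tf.placeholder(tf.int64, shape=[None])
loss = tf.placeholder(tf.float32, shape=[])

# Each tf.metrics op returns a (value, update_op) pair; the update op is run on every
# batch and the value is read back at the end of the epoch.
metrics = {
    'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions),
    # any other tf.metrics op can be added the same way, e.g. a running mean of the loss
    'loss': tf.metrics.mean(loss),
}
```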
114 | 115 | If you want to compute new metrics for which you can find a [tensorflow implementation](https://www.tensorflow.org/api_docs/python/tf/metrics), you can define it in the `model_fn.py` (add it to the `metrics` dictionnary). It will automatically be updated during the training and will be displayed at the end of each epoch. 116 | 117 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs. 118 | 119 | ## Resources 120 | 121 | Note that this repository uses Tensorflow 1.14. Tensorflow 2 has just been 122 | released, so the links below now point to Tensorflow 2 documentation. You can 123 | navigate to the old 1.14 docs through the API dropdown on navigation bar. 124 | 125 | There are major changes between TF 1 and TF 2, most notably Eager Execution 126 | being the new default mode. If your team is starting with a new project, we 127 | recommend using Tensorflow 2. 128 | 129 | Introduction to the `tf.data` pipeline 130 | 131 | - [programmer's guide](https://www.tensorflow.org/programmers_guide/datasets) 132 | - [consuming text data](https://www.tensorflow.org/programmers_guide/datasets#consuming_text_data) 133 | 134 | Tensorflow seq2seq using the tf.data pipeline: 135 | 136 | - [documentation](https://www.tensorflow.org/tutorials/seq2seq) 137 | - [github](https://github.com/tensorflow/nmt/) 138 | -------------------------------------------------------------------------------- /tensorflow/nlp/build_kaggle_dataset.py: -------------------------------------------------------------------------------- 1 | """Read, split and save the kaggle dataset for our model""" 2 | 3 | import csv 4 | import os 5 | import sys 6 | 7 | 8 | def load_dataset(path_csv): 9 | """Loads dataset into memory from csv file""" 10 | # Open the csv file, need to specify the encoding for python3 11 | use_python3 = sys.version_info[0] >= 3 12 | with (open(path_csv, encoding="windows-1252") if use_python3 else open(path_csv)) as f: 13 | csv_file = csv.reader(f, delimiter=',') 14 | dataset = [] 15 | words, tags = [], [] 16 | 17 | # Each line of the csv corresponds to one word 18 | for idx, row in enumerate(csv_file): 19 | if idx == 0: continue 20 | sentence, word, pos, tag = row 21 | # If the first column is non empty it means we reached a new sentence 22 | if len(sentence) != 0: 23 | if len(words) > 0: 24 | assert len(words) == len(tags) 25 | dataset.append((words, tags)) 26 | words, tags = [], [] 27 | try: 28 | word, tag = str(word), str(tag) 29 | words.append(word) 30 | tags.append(tag) 31 | except UnicodeDecodeError as e: 32 | print("An exception was raised, skipping a word: {}".format(e)) 33 | pass 34 | 35 | return dataset 36 | 37 | 38 | def save_dataset(dataset, save_dir): 39 | """Writes sentences.txt and labels.txt files in save_dir from dataset 40 | 41 | Args: 42 | dataset: ([(["a", "cat"], ["O", "O"]), ...]) 43 | save_dir: (string) 44 | """ 45 | # Create directory if it doesn't exist 46 | print("Saving in {}...".format(save_dir)) 47 | if not os.path.exists(save_dir): 48 | os.makedirs(save_dir) 49 | 50 | # Export the dataset 51 | with open(os.path.join(save_dir, 'sentences.txt'), 'w') as file_sentences: 52 | with open(os.path.join(save_dir, 'labels.txt'), 'w') as file_labels: 53 | for words, tags in dataset: 54 | file_sentences.write("{}\n".format(" ".join(words))) 55 | file_labels.write("{}\n".format(" ".join(tags))) 56 | print("- done.") 57 | 58 | 59 | if __name__ == "__main__": 60 | # Check that the dataset exists (you need to make sure you haven't 
downloaded the `ner.csv`) 61 | path_dataset = 'data/kaggle/ner_dataset.csv' 62 | msg = "{} file not found. Make sure you have downloaded the right dataset".format(path_dataset) 63 | assert os.path.isfile(path_dataset), msg 64 | 65 | # Load the dataset into memory 66 | print("Loading Kaggle dataset into memory...") 67 | dataset = load_dataset(path_dataset) 68 | print("- done.") 69 | 70 | # Split the dataset into train, dev and split (dummy split with no shuffle) 71 | train_dataset = dataset[:int(0.7*len(dataset))] 72 | dev_dataset = dataset[int(0.7*len(dataset)) : int(0.85*len(dataset))] 73 | test_dataset = dataset[int(0.85*len(dataset)):] 74 | 75 | # Save the datasets to files 76 | save_dataset(train_dataset, 'data/kaggle/train') 77 | save_dataset(dev_dataset, 'data/kaggle/dev') 78 | save_dataset(test_dataset, 'data/kaggle/test') -------------------------------------------------------------------------------- /tensorflow/nlp/build_vocab.py: -------------------------------------------------------------------------------- 1 | """Build vocabularies of words and tags from datasets""" 2 | 3 | import argparse 4 | from collections import Counter 5 | import json 6 | import os 7 | import sys 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--min_count_word', default=1, help="Minimum count for words in the dataset", 12 | type=int) 13 | parser.add_argument('--min_count_tag', default=1, help="Minimum count for tags in the dataset", 14 | type=int) 15 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 16 | 17 | # Hyper parameters for the vocab 18 | NUM_OOV_BUCKETS = 1 # number of buckets (= number of ids) for unknown words 19 | PAD_WORD = '' 20 | PAD_TAG = 'O' 21 | 22 | 23 | def save_vocab_to_txt_file(vocab, txt_path): 24 | """Writes one token per line, 0-based line id corresponds to the id of the token. 
25 | 26 | Args: 27 | vocab: (iterable object) yields token 28 | txt_path: (stirng) path to vocab file 29 | """ 30 | with open(txt_path, "w") as f: 31 | f.write("\n".join(token for token in vocab)) 32 | 33 | 34 | def save_dict_to_json(d, json_path): 35 | """Saves dict to json file 36 | 37 | Args: 38 | d: (dict) 39 | json_path: (string) path to json file 40 | """ 41 | with open(json_path, 'w') as f: 42 | d = {k: v for k, v in d.items()} 43 | json.dump(d, f, indent=4) 44 | 45 | 46 | def update_vocab(txt_path, vocab): 47 | """Update word and tag vocabulary from dataset 48 | 49 | Args: 50 | txt_path: (string) path to file, one sentence per line 51 | vocab: (dict or Counter) with update method 52 | 53 | Returns: 54 | dataset_size: (int) number of elements in the dataset 55 | """ 56 | with open(txt_path) as f: 57 | for i, line in enumerate(f): 58 | vocab.update(line.strip().split(' ')) 59 | 60 | 61 | return i + 1 62 | 63 | 64 | if __name__ == '__main__': 65 | args = parser.parse_args() 66 | 67 | # Build word vocab with train and test datasets 68 | print("Building word vocabulary...") 69 | words = Counter() 70 | size_train_sentences = update_vocab(os.path.join(args.data_dir, 'train/sentences.txt'), words) 71 | size_dev_sentences = update_vocab(os.path.join(args.data_dir, 'dev/sentences.txt'), words) 72 | size_test_sentences = update_vocab(os.path.join(args.data_dir, 'test/sentences.txt'), words) 73 | print("- done.") 74 | 75 | # Build tag vocab with train and test datasets 76 | print("Building tag vocabulary...") 77 | tags = Counter() 78 | size_train_tags = update_vocab(os.path.join(args.data_dir, 'train/labels.txt'), tags) 79 | size_dev_tags = update_vocab(os.path.join(args.data_dir, 'dev/labels.txt'), tags) 80 | size_test_tags = update_vocab(os.path.join(args.data_dir, 'test/labels.txt'), tags) 81 | print("- done.") 82 | 83 | # Assert same number of examples in datasets 84 | assert size_train_sentences == size_train_tags 85 | assert size_dev_sentences == size_dev_tags 86 | assert size_test_sentences == size_test_tags 87 | 88 | # Only keep most frequent tokens 89 | words = [tok for tok, count in words.items() if count >= args.min_count_word] 90 | tags = [tok for tok, count in tags.items() if count >= args.min_count_tag] 91 | 92 | # Add pad tokens 93 | if PAD_WORD not in words: words.append(PAD_WORD) 94 | if PAD_TAG not in tags: tags.append(PAD_TAG) 95 | 96 | # Save vocabularies to file 97 | print("Saving vocabularies to file...") 98 | save_vocab_to_txt_file(words, os.path.join(args.data_dir, 'words.txt')) 99 | save_vocab_to_txt_file(tags, os.path.join(args.data_dir, 'tags.txt')) 100 | print("- done.") 101 | 102 | # Save datasets properties in json file 103 | sizes = { 104 | 'train_size': size_train_sentences, 105 | 'dev_size': size_dev_sentences, 106 | 'test_size': size_test_sentences, 107 | 'vocab_size': len(words) + NUM_OOV_BUCKETS, 108 | 'number_of_tags': len(tags), 109 | 'pad_word': PAD_WORD, 110 | 'pad_tag': PAD_TAG, 111 | 'num_oov_buckets': NUM_OOV_BUCKETS 112 | } 113 | save_dict_to_json(sizes, os.path.join(args.data_dir, 'dataset_params.json')) 114 | 115 | # Logging sizes 116 | to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items()) 117 | print("Characteristics of the dataset:\n{}".format(to_print)) -------------------------------------------------------------------------------- /tensorflow/nlp/data/kaggle/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/nlp/data/kaggle/.gitkeep -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/dev/labels.txt: -------------------------------------------------------------------------------- 1 | B-PER O O B-LOC I-LOC 2 | B-PER O O B-LOC 3 | B-PER O O B-LOC 4 | B-PER O O B-LOC 5 | B-PER O O B-LOC 6 | B-PER O O B-LOC 7 | B-PER O O B-LOC 8 | B-PER O O B-LOC 9 | B-PER O O B-LOC 10 | B-PER O O B-LOC -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/dev/sentences.txt: -------------------------------------------------------------------------------- 1 | John lives in New York 2 | Kate lives in London 3 | Ziang lives in Beijing 4 | Pierre lives in Paris 5 | Dominik lives in Berlin 6 | Raul lives in Mexico 7 | Sergio lives in Rome 8 | Alexandr lives in Moscow 9 | Ines lives in Casablanca 10 | Jack lives in San Francisco -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/test/labels.txt: -------------------------------------------------------------------------------- 1 | B-PER O O B-LOC I-LOC 2 | B-PER O O B-LOC 3 | B-PER O O B-LOC 4 | B-PER O O B-LOC 5 | B-PER O O B-LOC 6 | B-PER O O B-LOC 7 | B-PER O O B-LOC 8 | B-PER O O B-LOC 9 | B-PER O O B-LOC 10 | B-PER O O B-LOC -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/test/sentences.txt: -------------------------------------------------------------------------------- 1 | John lives in New York 2 | Kate lives in London 3 | Ziang lives in Beijing 4 | Pierre lives in Paris 5 | Dominik lives in Berlin 6 | Raul lives in Mexico 7 | Sergio lives in Rome 8 | Alexandr lives in Moscow 9 | Ines lives in Casablanca 10 | Jack lives in San Francisco -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/train/labels.txt: -------------------------------------------------------------------------------- 1 | B-PER O O B-LOC I-LOC 2 | B-PER O O B-LOC 3 | B-PER O O B-LOC 4 | B-PER O O B-LOC 5 | B-PER O O B-LOC 6 | B-PER O O B-LOC 7 | B-PER O O B-LOC 8 | B-PER O O B-LOC 9 | B-PER O O B-LOC 10 | B-PER O O B-LOC -------------------------------------------------------------------------------- /tensorflow/nlp/data/small/train/sentences.txt: -------------------------------------------------------------------------------- 1 | John lives in New York 2 | Kate lives in London 3 | Ziang lives in Beijing 4 | Pierre lives in Paris 5 | Dominik lives in Berlin 6 | Raul lives in Mexico 7 | Sergio lives in Rome 8 | Alexandr lives in Moscow 9 | Ines lives in Casablanca 10 | Jack lives in San Francisco -------------------------------------------------------------------------------- /tensorflow/nlp/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluate the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from model.utils import Params 11 | from model.utils import set_logger 12 | from model.evaluation import evaluate 13 | from model.input_fn import input_fn 14 | from model.input_fn import load_dataset_from_text 15 | from model.model_fn import model_fn 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--model_dir', default='experiments/base_model', 20 | 
help="Directory containing params.json") 21 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 22 | parser.add_argument('--restore_from', default='best_weights', 23 | help="Subdirectory of model dir or file containing the weights") 24 | 25 | if __name__ == '__main__': 26 | # Set the random seed for the whole graph 27 | tf.set_random_seed(230) 28 | 29 | # Load the parameters 30 | args = parser.parse_args() 31 | json_path = os.path.join(args.model_dir, 'params.json') 32 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 33 | params = Params(json_path) 34 | 35 | # Load the parameters from the dataset, that gives the size etc. into params 36 | json_path = os.path.join(args.data_dir, 'dataset_params.json') 37 | assert os.path.isfile(json_path), "No json file found at {}, run build.py".format(json_path) 38 | params.update(json_path) 39 | num_oov_buckets = params.num_oov_buckets # number of buckets for unknown words 40 | 41 | # Set the logger 42 | set_logger(os.path.join(args.model_dir, 'evaluate.log')) 43 | 44 | # Get paths for vocabularies and dataset 45 | path_words = os.path.join(args.data_dir, 'words.txt') 46 | path_tags = os.path.join(args.data_dir, 'tags.txt') 47 | path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt') 48 | path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt') 49 | 50 | # Load Vocabularies 51 | words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets) 52 | tags = tf.contrib.lookup.index_table_from_file(path_tags) 53 | 54 | # Create the input data pipeline 55 | logging.info("Creating the dataset...") 56 | test_sentences = load_dataset_from_text(path_eval_sentences, words) 57 | test_labels = load_dataset_from_text(path_eval_labels, tags) 58 | 59 | # Specify other parameters for the dataset and the model 60 | params.eval_size = params.test_size 61 | params.id_pad_word = words.lookup(tf.constant(params.pad_word)) 62 | params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag)) 63 | 64 | # Create iterator over the test set 65 | inputs = input_fn('eval', test_sentences, test_labels, params) 66 | logging.info("- done.") 67 | 68 | # Define the model 69 | logging.info("Creating the model...") 70 | model_spec = model_fn('eval', inputs, params, reuse=False) 71 | logging.info("- done.") 72 | 73 | logging.info("Starting evaluation") 74 | evaluate(model_spec, args.model_dir, params, args.restore_from) 75 | -------------------------------------------------------------------------------- /tensorflow/nlp/experiments/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/nlp/experiments/.gitkeep -------------------------------------------------------------------------------- /tensorflow/nlp/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_version": "lstm", 3 | "lstm_num_units": 50, 4 | "embedding_size": 50, 5 | 6 | "learning_rate": 1e-3, 7 | "batch_size": 32, 8 | "num_epochs": 10, 9 | "dropout_rate": 0.3, 10 | 11 | "save_summary_steps": 100 12 | } 13 | -------------------------------------------------------------------------------- /tensorflow/nlp/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_version": "lstm", 3 
| "lstm_num_units": 50, 4 | "embedding_size": 50, 5 | 6 | "learning_rate": 1e-3, 7 | "batch_size": 32, 8 | "num_epochs": 2, 9 | "dropout_rate": 0.3, 10 | 11 | "save_summary_steps": 100 12 | } 13 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/nlp/model/__init__.py -------------------------------------------------------------------------------- /tensorflow/nlp/model/evaluation.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for evaluation""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | 11 | 12 | def evaluate_sess(sess, model_spec, num_steps, writer=None, params=None): 13 | """Train the model on `num_steps` batches. 14 | 15 | Args: 16 | sess: (tf.Session) current session 17 | model_spec: (dict) contains the graph operations or nodes needed for training 18 | num_steps: (int) train for this number of batches 19 | writer: (tf.summary.FileWriter) writer for summaries. Is None if we don't log anything 20 | params: (Params) hyperparameters 21 | """ 22 | update_metrics = model_spec['update_metrics'] 23 | eval_metrics = model_spec['metrics'] 24 | global_step = tf.train.get_global_step() 25 | 26 | # Load the evaluation dataset into the pipeline and initialize the metrics init op 27 | sess.run(model_spec['iterator_init_op']) 28 | sess.run(model_spec['metrics_init_op']) 29 | 30 | # compute metrics over the dataset 31 | for _ in range(num_steps): 32 | sess.run(update_metrics) 33 | 34 | # Get the values of the metrics 35 | metrics_values = {k: v[0] for k, v in eval_metrics.items()} 36 | metrics_val = sess.run(metrics_values) 37 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 38 | logging.info("- Eval metrics: " + metrics_string) 39 | 40 | # Add summaries manually to writer at global_step_val 41 | if writer is not None: 42 | global_step_val = sess.run(global_step) 43 | for tag, val in metrics_val.items(): 44 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) 45 | writer.add_summary(summ, global_step_val) 46 | 47 | return metrics_val 48 | 49 | 50 | def evaluate(model_spec, model_dir, params, restore_from): 51 | """Evaluate the model 52 | 53 | Args: 54 | model_spec: (dict) contains the graph operations or nodes needed for evaluation 55 | model_dir: (string) directory containing config, weights and log 56 | params: (Params) contains hyperparameters of the model. 
57 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 58 | restore_from: (string) directory or file containing weights to restore the graph 59 | """ 60 | # Initialize tf.Saver 61 | saver = tf.train.Saver() 62 | 63 | with tf.Session() as sess: 64 | # Initialize the lookup table 65 | sess.run(model_spec['variable_init_op']) 66 | 67 | # Reload weights from the weights subdirectory 68 | save_path = os.path.join(model_dir, restore_from) 69 | if os.path.isdir(save_path): 70 | save_path = tf.train.latest_checkpoint(save_path) 71 | saver.restore(sess, save_path) 72 | 73 | # Evaluate 74 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 75 | metrics = evaluate_sess(sess, model_spec, num_steps) 76 | metrics_name = '_'.join(restore_from.split('/')) 77 | save_path = os.path.join(model_dir, "metrics_test_{}.json".format(metrics_name)) 78 | save_dict_to_json(metrics, save_path) 79 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/input_fn.py: -------------------------------------------------------------------------------- 1 | """Create the input data pipeline using `tf.data`""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def load_dataset_from_text(path_txt, vocab): 7 | """Create tf.data Instance from txt file 8 | 9 | Args: 10 | path_txt: (string) path containing one example per line 11 | vocab: (tf.lookuptable) 12 | 13 | Returns: 14 | dataset: (tf.Dataset) yielding list of ids of tokens for each example 15 | """ 16 | # Load txt file, one example per line 17 | dataset = tf.data.TextLineDataset(path_txt) 18 | 19 | # Convert line into list of tokens, splitting by white space 20 | dataset = dataset.map(lambda string: tf.string_split([string]).values) 21 | 22 | # Lookup tokens to return their ids 23 | dataset = dataset.map(lambda tokens: (vocab.lookup(tokens), tf.size(tokens))) 24 | 25 | return dataset 26 | 27 | 28 | def input_fn(mode, sentences, labels, params): 29 | """Input function for NER 30 | 31 | Args: 32 | mode: (string) 'train', 'eval' or any other mode you can think of 33 | At training, we shuffle the data and have multiple epochs 34 | sentences: (tf.Dataset) yielding list of ids of words 35 | datasets: (tf.Dataset) yielding list of ids of tags 36 | params: (Params) contains hyperparameters of the model (ex: `params.num_epochs`) 37 | 38 | """ 39 | # Load all the dataset in memory for shuffling is training 40 | is_training = (mode == 'train') 41 | buffer_size = params.buffer_size if is_training else 1 42 | 43 | # Zip the sentence and the labels together 44 | dataset = tf.data.Dataset.zip((sentences, labels)) 45 | 46 | # Create batches and pad the sentences of different length 47 | padded_shapes = ((tf.TensorShape([None]), # sentence of unknown size 48 | tf.TensorShape([])), # size(words) 49 | (tf.TensorShape([None]), # labels of unknown size 50 | tf.TensorShape([]))) # size(tags) 51 | 52 | padding_values = ((params.id_pad_word, # sentence padded on the right with id_pad_word 53 | 0), # size(words) -- unused 54 | (params.id_pad_tag, # labels padded on the right with id_pad_tag 55 | 0)) # size(tags) -- unused 56 | 57 | 58 | dataset = (dataset 59 | .shuffle(buffer_size=buffer_size) 60 | .padded_batch(params.batch_size, padded_shapes=padded_shapes, padding_values=padding_values) 61 | .prefetch(1) # make sure you always have one batch ready to serve 62 | ) 63 | 64 | # Create initializable iterator from this dataset so that we can reset at each epoch 65 | iterator = 
dataset.make_initializable_iterator() 66 | 67 | # Query the output of the iterator for input to the model 68 | ((sentence, sentence_lengths), (labels, _)) = iterator.get_next() 69 | init_op = iterator.initializer 70 | 71 | # Build and return a dictionnary containing the nodes / ops 72 | inputs = { 73 | 'sentence': sentence, 74 | 'labels': labels, 75 | 'sentence_lengths': sentence_lengths, 76 | 'iterator_init_op': init_op 77 | } 78 | 79 | return inputs 80 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/model_fn.py: -------------------------------------------------------------------------------- 1 | """Define the model.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def build_model(mode, inputs, params): 7 | """Compute logits of the model (output distribution) 8 | 9 | Args: 10 | mode: (string) 'train', 'eval', etc. 11 | inputs: (dict) contains the inputs of the graph (features, labels...) 12 | this can be `tf.placeholder` or outputs of `tf.data` 13 | params: (Params) contains hyperparameters of the model (ex: `params.learning_rate`) 14 | 15 | Returns: 16 | output: (tf.Tensor) output of the model 17 | """ 18 | sentence = inputs['sentence'] 19 | 20 | if params.model_version == 'lstm': 21 | # Get word embeddings for each token in the sentence 22 | embeddings = tf.get_variable(name="embeddings", dtype=tf.float32, 23 | shape=[params.vocab_size, params.embedding_size]) 24 | sentence = tf.nn.embedding_lookup(embeddings, sentence) 25 | 26 | # Apply LSTM over the embeddings 27 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(params.lstm_num_units) 28 | output, _ = tf.nn.dynamic_rnn(lstm_cell, sentence, dtype=tf.float32) 29 | 30 | # Compute logits from the output of the LSTM 31 | logits = tf.layers.dense(output, params.number_of_tags) 32 | 33 | else: 34 | raise NotImplementedError("Unknown model version: {}".format(params.model_version)) 35 | 36 | return logits 37 | 38 | 39 | def model_fn(mode, inputs, params, reuse=False): 40 | """Model function defining the graph operations. 41 | 42 | Args: 43 | mode: (string) 'train', 'eval', etc. 44 | inputs: (dict) contains the inputs of the graph (features, labels...) 
45 | this can be `tf.placeholder` or outputs of `tf.data` 46 | params: (Params) contains hyperparameters of the model (ex: `params.learning_rate`) 47 | reuse: (bool) whether to reuse the weights 48 | 49 | Returns: 50 | model_spec: (dict) contains the graph operations or nodes needed for training / evaluation 51 | """ 52 | is_training = (mode == 'train') 53 | labels = inputs['labels'] 54 | sentence_lengths = inputs['sentence_lengths'] 55 | 56 | # ----------------------------------------------------------- 57 | # MODEL: define the layers of the model 58 | with tf.variable_scope('model', reuse=reuse): 59 | # Compute the output distribution of the model and the predictions 60 | logits = build_model(mode, inputs, params) 61 | predictions = tf.argmax(logits, -1) 62 | 63 | # Define loss and accuracy (we need to apply a mask to account for padding) 64 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels) 65 | mask = tf.sequence_mask(sentence_lengths) 66 | losses = tf.boolean_mask(losses, mask) 67 | loss = tf.reduce_mean(losses) 68 | accuracy = tf.reduce_mean(tf.cast(tf.equal(labels, predictions), tf.float32)) 69 | 70 | # Define training step that minimizes the loss with the Adam optimizer 71 | if is_training: 72 | optimizer = tf.train.AdamOptimizer(params.learning_rate) 73 | global_step = tf.train.get_or_create_global_step() 74 | train_op = optimizer.minimize(loss, global_step=global_step) 75 | 76 | # ----------------------------------------------------------- 77 | # METRICS AND SUMMARIES 78 | # Metrics for evaluation using tf.metrics (average over whole dataset) 79 | with tf.variable_scope("metrics"): 80 | metrics = { 81 | 'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions), 82 | 'loss': tf.metrics.mean(loss) 83 | } 84 | 85 | # Group the update ops for the tf.metrics 86 | update_metrics_op = tf.group(*[op for _, op in metrics.values()]) 87 | 88 | # Get the op to reset the local variables used in tf.metrics 89 | metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 90 | metrics_init_op = tf.variables_initializer(metric_variables) 91 | 92 | # Summaries for training 93 | tf.summary.scalar('loss', loss) 94 | tf.summary.scalar('accuracy', accuracy) 95 | 96 | # ----------------------------------------------------------- 97 | # MODEL SPECIFICATION 98 | # Create the model specification and return it 99 | # It contains nodes or operations in the graph that will be used for training and evaluation 100 | model_spec = inputs 101 | variable_init_op = tf.group(*[tf.global_variables_initializer(), tf.tables_initializer()]) 102 | model_spec['variable_init_op'] = variable_init_op 103 | model_spec["predictions"] = predictions 104 | model_spec['loss'] = loss 105 | model_spec['accuracy'] = accuracy 106 | model_spec['metrics_init_op'] = metrics_init_op 107 | model_spec['metrics'] = metrics 108 | model_spec['update_metrics'] = update_metrics_op 109 | model_spec['summary_op'] = tf.summary.merge_all() 110 | 111 | if is_training: 112 | model_spec['train_op'] = train_op 113 | 114 | return model_spec 115 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/training.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for training""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | from model.evaluation import 
evaluate_sess 11 | 12 | 13 | def train_sess(sess, model_spec, num_steps, writer, params): 14 | """Train the model on `num_steps` batches 15 | 16 | Args: 17 | sess: (tf.Session) current session 18 | model_spec: (dict) contains the graph operations or nodes needed for training 19 | num_steps: (int) train for this number of batches 20 | writer: (tf.summary.FileWriter) writer for summaries 21 | params: (Params) hyperparameters 22 | """ 23 | # Get relevant graph operations or nodes needed for training 24 | loss = model_spec['loss'] 25 | train_op = model_spec['train_op'] 26 | update_metrics = model_spec['update_metrics'] 27 | metrics = model_spec['metrics'] 28 | summary_op = model_spec['summary_op'] 29 | global_step = tf.train.get_global_step() 30 | 31 | # Load the training dataset into the pipeline and initialize the metrics local variables 32 | sess.run(model_spec['iterator_init_op']) 33 | sess.run(model_spec['metrics_init_op']) 34 | 35 | # Use tqdm for progress bar 36 | t = trange(num_steps) 37 | for i in t: 38 | # Evaluate summaries for tensorboard only once in a while 39 | if i % params.save_summary_steps == 0: 40 | # Perform a mini-batch update 41 | _, _, loss_val, summ, global_step_val = sess.run([train_op, update_metrics, loss, 42 | summary_op, global_step]) 43 | # Write summaries for tensorboard 44 | writer.add_summary(summ, global_step_val) 45 | else: 46 | _, _, loss_val = sess.run([train_op, update_metrics, loss]) 47 | # Log the loss in the tqdm progress bar 48 | t.set_postfix(loss='{:05.3f}'.format(loss_val)) 49 | 50 | 51 | metrics_values = {k: v[0] for k, v in metrics.items()} 52 | metrics_val = sess.run(metrics_values) 53 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 54 | logging.info("- Train metrics: " + metrics_string) 55 | 56 | 57 | def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, restore_from=None): 58 | """Train the model and evaluate every epoch. 59 | 60 | Args: 61 | train_model_spec: (dict) contains the graph operations or nodes needed for training 62 | eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation 63 | model_dir: (string) directory containing config, weights and log 64 | params: (Params) contains hyperparameters of the model. 
65 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 66 | restore_from: (string) directory or file containing weights to restore the graph 67 | """ 68 | # Initialize tf.Saver instances to save weights during training 69 | last_saver = tf.train.Saver() # will keep last 5 epochs 70 | best_saver = tf.train.Saver(max_to_keep=1) # only keep 1 best checkpoint (best on eval) 71 | begin_at_epoch = 0 72 | 73 | with tf.Session() as sess: 74 | # Initialize model variables 75 | sess.run(train_model_spec['variable_init_op']) 76 | 77 | # Reload weights from directory if specified 78 | if restore_from is not None: 79 | logging.info("Restoring parameters from {}".format(restore_from)) 80 | if os.path.isdir(restore_from): 81 | restore_from = tf.train.latest_checkpoint(restore_from) 82 | begin_at_epoch = int(restore_from.split('-')[-1]) 83 | last_saver.restore(sess, restore_from) 84 | 85 | # For tensorboard (takes care of writing summaries to files) 86 | train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summaries'), sess.graph) 87 | eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summaries'), sess.graph) 88 | 89 | best_eval_acc = 0.0 90 | for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs): 91 | # Run one epoch 92 | logging.info("Epoch {}/{}".format(epoch + 1, begin_at_epoch + params.num_epochs)) 93 | # Compute number of batches in one epoch (one full pass over the training set) 94 | num_steps = (params.train_size + params.batch_size - 1) // params.batch_size 95 | train_sess(sess, train_model_spec, num_steps, train_writer, params) 96 | 97 | # Save weights 98 | last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch') 99 | last_saver.save(sess, last_save_path, global_step=epoch + 1) 100 | 101 | # Evaluate for one epoch on validation set 102 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 103 | metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer) 104 | 105 | # If best_eval, best_save_path 106 | eval_acc = metrics['accuracy'] 107 | if eval_acc >= best_eval_acc: 108 | # Store new best accuracy 109 | best_eval_acc = eval_acc 110 | # Save weights 111 | best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch') 112 | best_save_path = best_saver.save(sess, best_save_path, global_step=epoch + 1) 113 | logging.info("- Found new best accuracy, saving in {}".format(best_save_path)) 114 | # Save best eval metrics in a json file in the model directory 115 | best_json_path = os.path.join(model_dir, "metrics_eval_best_weights.json") 116 | save_dict_to_json(metrics, best_json_path) 117 | 118 | # Save latest eval metrics in a json file in the model directory 119 | last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json") 120 | save_dict_to_json(metrics, last_json_path) 121 | -------------------------------------------------------------------------------- /tensorflow/nlp/model/utils.py: -------------------------------------------------------------------------------- 1 | """General utility functions""" 2 | 3 | import json 4 | import logging 5 | 6 | 7 | class Params(): 8 | """Class that loads hyperparameters from a json file. 
9 | 10 | Example: 11 | ``` 12 | params = Params(json_path) 13 | print(params.learning_rate) 14 | params.learning_rate = 0.5 # change the value of learning_rate in params 15 | ``` 16 | """ 17 | 18 | def __init__(self, json_path): 19 | self.update(json_path) 20 | 21 | def save(self, json_path): 22 | """Saves parameters to json file""" 23 | with open(json_path, 'w') as f: 24 | json.dump(self.__dict__, f, indent=4) 25 | 26 | def update(self, json_path): 27 | """Loads parameters from json file""" 28 | with open(json_path) as f: 29 | params = json.load(f) 30 | self.__dict__.update(params) 31 | 32 | @property 33 | def dict(self): 34 | """Gives dict-like access to Params instance by `params.dict['learning_rate']`""" 35 | return self.__dict__ 36 | 37 | 38 | def set_logger(log_path): 39 | """Sets the logger to log info in terminal and file `log_path`. 40 | 41 | In general, it is useful to have a logger so that every output to the terminal is saved 42 | in a permanent file. Here we save it to `model_dir/train.log`. 43 | 44 | Example: 45 | ``` 46 | logging.info("Starting training...") 47 | ``` 48 | 49 | Args: 50 | log_path: (string) where to log 51 | """ 52 | logger = logging.getLogger() 53 | logger.setLevel(logging.INFO) 54 | 55 | if not logger.handlers: 56 | # Logging to a file 57 | file_handler = logging.FileHandler(log_path) 58 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 59 | logger.addHandler(file_handler) 60 | 61 | # Logging to console 62 | stream_handler = logging.StreamHandler() 63 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 64 | logger.addHandler(stream_handler) 65 | 66 | 67 | def save_dict_to_json(d, json_path): 68 | """Saves dict of floats in json file 69 | 70 | Args: 71 | d: (dict) of float-castable values (np.float, int, float, etc.) 
72 | json_path: (string) path to json file 73 | """ 74 | with open(json_path, 'w') as f: 75 | # We need to convert the values to float for json (it doesn't accept np.array, np.float, ) 76 | d = {k: float(v) for k, v in d.items()} 77 | json.dump(d, f, indent=4) 78 | -------------------------------------------------------------------------------- /tensorflow/nlp/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.0 2 | tabulate 3 | tqdm 4 | -------------------------------------------------------------------------------- /tensorflow/nlp/search_hyperparams.py: -------------------------------------------------------------------------------- 1 | """Peform hyperparemeters search""" 2 | 3 | import argparse 4 | import os 5 | from subprocess import check_call 6 | import sys 7 | 8 | from model.utils import Params 9 | 10 | 11 | PYTHON = sys.executable 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--parent_dir', default='experiments/learning_rate', 14 | help="Directory containing params.json") 15 | parser.add_argument('--data_dir', default='data/small', 16 | help="Directory containing the dataset") 17 | 18 | 19 | def launch_training_job(parent_dir, data_dir, job_name, params): 20 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name 21 | 22 | Args: 23 | parent_dir: (string) directory containing config, weights and log 24 | data_dir: (string) directory containing the dataset 25 | params: (dict) containing hyperparameters 26 | """ 27 | # Create a new folder in parent_dir with unique_name "job_name" 28 | model_dir = os.path.join(parent_dir, job_name) 29 | if not os.path.exists(model_dir): 30 | os.makedirs(model_dir) 31 | 32 | # Write parameters in json file 33 | json_path = os.path.join(model_dir, 'params.json') 34 | params.save(json_path) 35 | 36 | # Launch training with this config 37 | cmd = "{python} train.py --model_dir {model_dir} --data_dir {data_dir}".format(python=PYTHON, 38 | model_dir=model_dir, data_dir=data_dir) 39 | print(cmd) 40 | check_call(cmd, shell=True) 41 | 42 | 43 | if __name__ == "__main__": 44 | # Load the "reference" parameters from parent_dir json file 45 | args = parser.parse_args() 46 | json_path = os.path.join(args.parent_dir, 'params.json') 47 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 48 | params = Params(json_path) 49 | 50 | # Perform hypersearch over one parameter 51 | learning_rates = [1e-4, 1e-3, 1e-2] 52 | 53 | for learning_rate in learning_rates: 54 | # Modify the relevant parameter in params 55 | params.learning_rate = learning_rate 56 | 57 | # Launch job (name has to be unique) 58 | job_name = "learning_rate_{}".format(learning_rate) 59 | launch_training_job(args.parent_dir, args.data_dir, job_name, params) 60 | -------------------------------------------------------------------------------- /tensorflow/nlp/synthesize_results.py: -------------------------------------------------------------------------------- 1 | """Aggregates results from the metrics_eval_best_weights.json in a parent folder""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in folder `parent_dir`. 
17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_dev.json` 20 | 21 | Args: 22 | parent_dir: (string) path to directory containing experiments results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_eval_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) 63 | -------------------------------------------------------------------------------- /tensorflow/nlp/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import tensorflow as tf 8 | 9 | from model.utils import Params 10 | from model.utils import set_logger 11 | from model.training import train_and_evaluate 12 | from model.input_fn import input_fn 13 | from model.input_fn import load_dataset_from_text 14 | from model.model_fn import model_fn 15 | 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--model_dir', default='experiments/base_model', 19 | help="Directory containing params.json") 20 | parser.add_argument('--data_dir', default='data/small', help="Directory containing the dataset") 21 | parser.add_argument('--restore_dir', default=None, 22 | help="Optional, directory containing weights to reload before training") 23 | 24 | 25 | if __name__ == '__main__': 26 | # Set the random seed for the whole graph for reproductible experiments 27 | tf.set_random_seed(230) 28 | 29 | # Load the parameters from the experiment params.json file in model_dir 30 | args = parser.parse_args() 31 | json_path = os.path.join(args.model_dir, 'params.json') 32 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 33 | params = Params(json_path) 34 | 35 | # Load the parameters from the dataset, that gives the size etc. 
into params 36 | json_path = os.path.join(args.data_dir, 'dataset_params.json') 37 | assert os.path.isfile(json_path), "No json file found at {}, run build_vocab.py".format(json_path) 38 | params.update(json_path) 39 | num_oov_buckets = params.num_oov_buckets # number of buckets for unknown words 40 | 41 | # Check that we are not overwriting some previous experiment 42 | # Comment these lines if you are developing your model and don't care about overwritting 43 | model_dir_has_best_weights = os.path.isdir(os.path.join(args.model_dir, "best_weights")) 44 | overwritting = model_dir_has_best_weights and args.restore_dir is None 45 | assert not overwritting, "Weights found in model_dir, aborting to avoid overwrite" 46 | 47 | # Set the logger 48 | set_logger(os.path.join(args.model_dir, 'train.log')) 49 | 50 | # Get paths for vocabularies and dataset 51 | path_words = os.path.join(args.data_dir, 'words.txt') 52 | path_tags = os.path.join(args.data_dir, 'tags.txt') 53 | path_train_sentences = os.path.join(args.data_dir, 'train/sentences.txt') 54 | path_train_labels = os.path.join(args.data_dir, 'train/labels.txt') 55 | path_eval_sentences = os.path.join(args.data_dir, 'dev/sentences.txt') 56 | path_eval_labels = os.path.join(args.data_dir, 'dev/labels.txt') 57 | 58 | # Load Vocabularies 59 | words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=num_oov_buckets) 60 | tags = tf.contrib.lookup.index_table_from_file(path_tags) 61 | 62 | # Create the input data pipeline 63 | logging.info("Creating the datasets...") 64 | train_sentences = load_dataset_from_text(path_train_sentences, words) 65 | train_labels = load_dataset_from_text(path_train_labels, tags) 66 | eval_sentences = load_dataset_from_text(path_eval_sentences, words) 67 | eval_labels = load_dataset_from_text(path_eval_labels, tags) 68 | 69 | # Specify other parameters for the dataset and the model 70 | params.eval_size = params.dev_size 71 | params.buffer_size = params.train_size # buffer size for shuffling 72 | params.id_pad_word = words.lookup(tf.constant(params.pad_word)) 73 | params.id_pad_tag = tags.lookup(tf.constant(params.pad_tag)) 74 | 75 | # Create the two iterators over the two datasets 76 | train_inputs = input_fn('train', train_sentences, train_labels, params) 77 | eval_inputs = input_fn('eval', eval_sentences, eval_labels, params) 78 | logging.info("- done.") 79 | 80 | # Define the models (2 different set of nodes that share weights for train and eval) 81 | logging.info("Creating the model...") 82 | train_model_spec = model_fn('train', train_inputs, params) 83 | eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True) 84 | logging.info("- done.") 85 | 86 | # Train the model 87 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 88 | train_and_evaluate(train_model_spec, eval_model_spec, args.model_dir, params, args.restore_dir) -------------------------------------------------------------------------------- /tensorflow/vision/README.md: -------------------------------------------------------------------------------- 1 | # Hand Signs Recognition with Tensorflow 2 | 3 | _Authors: Olivier Moindrot and Guillaume Genthial_ 4 | 5 | Take the time to read the [tutorials](https://cs230-stanford.github.io). 6 | 7 | Note: all scripts must be run in folder `tensorflow/vision`. 8 | 9 | ## Requirements 10 | 11 | We recommend using python3 and a virtual env. See instructions [here](https://cs230-stanford.github.io/project-starter-code.html). 
12 | 13 | ``` 14 | virtualenv -p python3 .env 15 | source .env/bin/activate 16 | pip install -r requirements.txt 17 | ``` 18 | 19 | When you're done working on the project, deactivate the virtual environment with `deactivate`. 20 | 21 | ## Task 22 | 23 | Given an image of a hand doing a sign representing 0, 1, 2, 3, 4 or 5, predict the correct label. 24 | 25 | ## Download the SIGNS dataset 26 | 27 | For the vision example, we will use the SIGNS dataset created for this class. The dataset is hosted on Google Drive; download it [here][signs]. 28 | 29 | This will download the SIGNS dataset (~1.1 GB) containing photos of hand signs representing the numbers 0 to 5. 30 | Here is the structure of the data: 31 | 32 | ``` 33 | SIGNS/ 34 | train_signs/ 35 | 0_IMG_5864.jpg 36 | ... 37 | test_signs/ 38 | 0_IMG_5942.jpg 39 | ... 40 | ``` 41 | 42 | The images are named following `{label}_IMG_{id}.jpg` where the label is in `[0, 5]`. 43 | The training set contains 1,080 images and the test set contains 120 images. 44 | 45 | Once the download is complete, move the dataset into `data/SIGNS`. 46 | Run the script `build_dataset.py`, which will resize the images to size `(64, 64)`. The new resized dataset will be located by default in `data/64x64_SIGNS`: 47 | 48 | ```bash 49 | python build_dataset.py --data_dir data/SIGNS --output_dir data/64x64_SIGNS 50 | ``` 51 | 52 | ## Quickstart (~10 min) 53 | 54 | 1. **Build the dataset of size 64x64**: make sure you complete this step before training 55 | 56 | ```bash 57 | python build_dataset.py --data_dir data/SIGNS\ dataset/ --output_dir data/64x64_SIGNS 58 | ``` 59 | 60 | 2. **Your first experiment** We created a `base_model` directory for you under the `experiments` directory. It contains a file `params.json` which sets the parameters for the experiment. It looks like 61 | 62 | ```json 63 | { 64 | "learning_rate": 1e-3, 65 | "batch_size": 32, 66 | "num_epochs": 10, 67 | ... 68 | } 69 | ``` 70 | 71 | For every new experiment, you will need to create a new directory under `experiments` with a similar `params.json` file. 72 | 73 | 3. **Train** your experiment. Simply run 74 | 75 | ``` 76 | python train.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model 77 | ``` 78 | 79 | It will instantiate a model and train it on the training set following the parameters specified in `params.json`. It will also evaluate some metrics on the development set. 80 | 81 | 4. **Your first hyperparameter search** We created a new directory `learning_rate` in `experiments` for you. Now, run 82 | 83 | ``` 84 | python search_hyperparams.py --data_dir data/64x64_SIGNS --parent_dir experiments/learning_rate 85 | ``` 86 | 87 | It will train and evaluate a model with the different values of the learning rate defined in `search_hyperparams.py` and create a new directory for each experiment under `experiments/learning_rate/`. 88 | 89 | 5. **Display the results** of the hyperparameter search in a nice format 90 | 91 | ``` 92 | python synthesize_results.py --parent_dir experiments/learning_rate 93 | ``` 94 | 95 | 6. **Evaluation on the test set** Once you've run many experiments and selected your best model and hyperparameters based on their performance on the development set, you can finally evaluate the performance of your model on the test set.
Run 96 | 97 | ``` 98 | python evaluate.py --data_dir data/64x64_SIGNS --model_dir experiments/base_model 99 | ``` 100 | 101 | ## Guidelines for more advanced use 102 | 103 | We recommend reading through `train.py` to get a high-level overview of the steps: 104 | 105 | - loading the hyperparameters for the experiment (the `params.json`) 106 | - getting the filenames / labels 107 | - creating the input of our model by zipping the filenames and labels together (`input_fn(...)`), reading the images as well as performing batching and shuffling. 108 | - creating the model (=nodes / ops of the `tf.Graph()`) by calling `model_fn(...)` 109 | - training the model for a given number of epochs by calling `train_and_evaluate(...)` 110 | 111 | Once you get the high-level idea, depending on your dataset, you might want to modify 112 | 113 | - `model/model_fn.py` to change the model 114 | - `model/input_fn.py` to change the way you read data 115 | - `train.py` and `evaluate.py` if somes changes in the model or input require changes here 116 | 117 | If you want to compute new metrics for which you can find a [tensorflow implementation](https://www.tensorflow.org/api_docs/python/tf/metrics), you can define it in the `model_fn.py` (add it to the `metrics` dictionnary). It will automatically be updated during the training and will be displayed at the end of each epoch. 118 | 119 | Once you get something working for your dataset, feel free to edit any part of the code to suit your own needs. 120 | 121 | ## Resources 122 | 123 | Note that this repository uses Tensorflow 1.14. Tensorflow 2 has just been 124 | released, so the links below now point to Tensorflow 2 documentation. You can 125 | navigate to the old 1.14 docs through the API dropdown on navigation bar. 126 | 127 | There are major changes between TF 1 and TF 2, most notably Eager Execution 128 | being the new default mode. If your team is starting with a new project, we 129 | recommend using Tensorflow 2. 130 | 131 | Introduction to the `tf.data` pipeline 132 | 133 | - [programmer's guide](https://www.tensorflow.org/programmers_guide/datasets) 134 | - [reading images](https://www.tensorflow.org/programmers_guide/datasets#decoding_image_data_and_resizing_it) 135 | 136 | [signs]: https://drive.google.com/file/d/1ufiR6hUKhXoAyiBNsySPkUwlvE_wfEHC/view?usp=sharing 137 | -------------------------------------------------------------------------------- /tensorflow/vision/build_dataset.py: -------------------------------------------------------------------------------- 1 | """Split the SIGNS dataset into train/dev/test and resize images to 64x64. 2 | 3 | The SIGNS dataset comes in the following format: 4 | train_signs/ 5 | 0_IMG_5864.jpg 6 | ... 7 | test_signs/ 8 | 0_IMG_5942.jpg 9 | ... 10 | 11 | Original images have size (3024, 3024). 12 | Resizing to (64, 64) reduces the dataset size from 1.16 GB to 4.7 MB, and loading smaller images 13 | makes training faster. 14 | 15 | We already have a test set created, so we only need to split "train_signs" into train and dev sets. 16 | Because we don't have a lot of images and we want that the statistics on the dev set be as 17 | representative as possible, we'll take 20% of "train_signs" as dev set. 
18 | """ 19 | 20 | import argparse 21 | import random 22 | import os 23 | 24 | from PIL import Image 25 | from tqdm import tqdm 26 | 27 | 28 | SIZE = 64 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--data_dir', default='data/SIGNS', help="Directory with the SIGNS dataset") 32 | parser.add_argument('--output_dir', default='data/64x64_SIGNS', help="Where to write the new data") 33 | 34 | 35 | def resize_and_save(filename, output_dir, size=SIZE): 36 | """Resize the image contained in `filename` and save it to the `output_dir`""" 37 | image = Image.open(filename) 38 | # Use bilinear interpolation instead of the default "nearest neighbor" method 39 | image = image.resize((size, size), Image.BILINEAR) 40 | image.save(os.path.join(output_dir, filename.split('/')[-1])) 41 | 42 | 43 | if __name__ == '__main__': 44 | args = parser.parse_args() 45 | 46 | assert os.path.isdir(args.data_dir), "Couldn't find the dataset at {}".format(args.data_dir) 47 | 48 | # Define the data directories 49 | train_data_dir = os.path.join(args.data_dir, 'train_signs') 50 | test_data_dir = os.path.join(args.data_dir, 'test_signs') 51 | 52 | # Get the filenames in each directory (train and test) 53 | filenames = os.listdir(train_data_dir) 54 | filenames = [os.path.join(train_data_dir, f) for f in filenames if f.endswith('.jpg')] 55 | 56 | test_filenames = os.listdir(test_data_dir) 57 | test_filenames = [os.path.join(test_data_dir, f) for f in test_filenames if f.endswith('.jpg')] 58 | 59 | # Split the images in 'train_signs' into 80% train and 20% dev 60 | # Make sure to always shuffle with a fixed seed so that the split is reproducible 61 | random.seed(230) 62 | filenames.sort() 63 | random.shuffle(filenames) 64 | 65 | split = int(0.8 * len(filenames)) 66 | train_filenames = filenames[:split] 67 | dev_filenames = filenames[split:] 68 | 69 | filenames = {'train': train_filenames, 70 | 'dev': dev_filenames, 71 | 'test': test_filenames} 72 | 73 | if not os.path.exists(args.output_dir): 74 | os.mkdir(args.output_dir) 75 | else: 76 | print("Warning: output dir {} already exists".format(args.output_dir)) 77 | 78 | # Preprocess train, dev and test 79 | for split in ['train', 'dev', 'test']: 80 | output_dir_split = os.path.join(args.output_dir, '{}_signs'.format(split)) 81 | if not os.path.exists(output_dir_split): 82 | os.mkdir(output_dir_split) 83 | else: 84 | print("Warning: dir {} already exists".format(output_dir_split)) 85 | 86 | print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split)) 87 | for filename in tqdm(filenames[split]): 88 | resize_and_save(filename, output_dir_split, size=SIZE) 89 | 90 | print("Done building dataset") 91 | -------------------------------------------------------------------------------- /tensorflow/vision/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/vision/data/.gitkeep -------------------------------------------------------------------------------- /tensorflow/vision/evaluate.py: -------------------------------------------------------------------------------- 1 | """Evaluate the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | 7 | import tensorflow as tf 8 | 9 | from model.input_fn import input_fn 10 | from model.model_fn import model_fn 11 | from model.evaluation import evaluate 12 | from model.utils import Params 13 | from model.utils import 
set_logger 14 | 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--model_dir', default='experiments/test', 18 | help="Experiment directory containing params.json") 19 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 20 | help="Directory containing the dataset") 21 | parser.add_argument('--restore_from', default='best_weights', 22 | help="Subdirectory of model dir or file containing the weights") 23 | 24 | 25 | if __name__ == '__main__': 26 | # Set the random seed for the whole graph 27 | tf.set_random_seed(230) 28 | 29 | # Load the parameters 30 | args = parser.parse_args() 31 | json_path = os.path.join(args.model_dir, 'params.json') 32 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 33 | params = Params(json_path) 34 | 35 | # Set the logger 36 | set_logger(os.path.join(args.model_dir, 'evaluate.log')) 37 | 38 | # Create the input data pipeline 39 | logging.info("Creating the dataset...") 40 | data_dir = args.data_dir 41 | test_data_dir = os.path.join(data_dir, "test_signs") 42 | 43 | # Get the filenames from the test set 44 | test_filenames = os.listdir(test_data_dir) 45 | test_filenames = [os.path.join(test_data_dir, f) for f in test_filenames if f.endswith('.jpg')] 46 | 47 | test_labels = [int(f.split('/')[-1][0]) for f in test_filenames] 48 | 49 | # specify the size of the evaluation set 50 | params.eval_size = len(test_filenames) 51 | 52 | # create the iterator over the dataset 53 | test_inputs = input_fn(False, test_filenames, test_labels, params) 54 | 55 | # Define the model 56 | logging.info("Creating the model...") 57 | model_spec = model_fn('eval', test_inputs, params, reuse=False) 58 | 59 | logging.info("Starting evaluation") 60 | evaluate(model_spec, args.model_dir, params, args.restore_from) 61 | -------------------------------------------------------------------------------- /tensorflow/vision/experiments/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/vision/experiments/.gitkeep -------------------------------------------------------------------------------- /tensorflow/vision/experiments/base_model/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | 6 | "num_channels": 16, 7 | "use_batch_norm": true, 8 | "bn_momentum": 0.9, 9 | 10 | "image_size": 64, 11 | "use_random_flip": true, 12 | "num_labels": 6, 13 | 14 | "num_parallel_calls": 4, 15 | "save_summary_steps": 1 16 | } 17 | -------------------------------------------------------------------------------- /tensorflow/vision/experiments/learning_rate/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "learning_rate": 1e-3, 3 | "batch_size": 32, 4 | "num_epochs": 10, 5 | 6 | "num_channels": 16, 7 | "use_batch_norm": true, 8 | "bn_momentum": 0.9, 9 | 10 | "image_size": 64, 11 | "use_random_flip": true, 12 | "num_labels": 6, 13 | 14 | "num_parallel_calls": 4, 15 | "save_summary_steps": 1 16 | } 17 | -------------------------------------------------------------------------------- /tensorflow/vision/model/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cs230-stanford/cs230-code-examples/478e747b1c8bf57c6e2ce6b7ffd8068fe0287056/tensorflow/vision/model/__init__.py -------------------------------------------------------------------------------- /tensorflow/vision/model/evaluation.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for evaluation""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | 11 | 12 | def evaluate_sess(sess, model_spec, num_steps, writer=None, params=None): 13 | """Train the model on `num_steps` batches. 14 | 15 | Args: 16 | sess: (tf.Session) current session 17 | model_spec: (dict) contains the graph operations or nodes needed for training 18 | num_steps: (int) train for this number of batches 19 | writer: (tf.summary.FileWriter) writer for summaries. Is None if we don't log anything 20 | params: (Params) hyperparameters 21 | """ 22 | update_metrics = model_spec['update_metrics'] 23 | eval_metrics = model_spec['metrics'] 24 | global_step = tf.train.get_global_step() 25 | 26 | # Load the evaluation dataset into the pipeline and initialize the metrics init op 27 | sess.run(model_spec['iterator_init_op']) 28 | sess.run(model_spec['metrics_init_op']) 29 | 30 | # compute metrics over the dataset 31 | for _ in range(num_steps): 32 | sess.run(update_metrics) 33 | 34 | # Get the values of the metrics 35 | metrics_values = {k: v[0] for k, v in eval_metrics.items()} 36 | metrics_val = sess.run(metrics_values) 37 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 38 | logging.info("- Eval metrics: " + metrics_string) 39 | 40 | # Add summaries manually to writer at global_step_val 41 | if writer is not None: 42 | global_step_val = sess.run(global_step) 43 | for tag, val in metrics_val.items(): 44 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) 45 | writer.add_summary(summ, global_step_val) 46 | 47 | return metrics_val 48 | 49 | 50 | def evaluate(model_spec, model_dir, params, restore_from): 51 | """Evaluate the model 52 | 53 | Args: 54 | model_spec: (dict) contains the graph operations or nodes needed for evaluation 55 | model_dir: (string) directory containing config, weights and log 56 | params: (Params) contains hyperparameters of the model. 
57 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 58 | restore_from: (string) directory or file containing weights to restore the graph 59 | """ 60 | # Initialize tf.Saver 61 | saver = tf.train.Saver() 62 | 63 | with tf.Session() as sess: 64 | # Initialize the model variables 65 | sess.run(model_spec['variable_init_op']) 66 | 67 | # Reload weights from the weights subdirectory 68 | save_path = os.path.join(model_dir, restore_from) 69 | if os.path.isdir(save_path): 70 | save_path = tf.train.latest_checkpoint(save_path) 71 | saver.restore(sess, save_path) 72 | 73 | # Evaluate 74 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 75 | metrics = evaluate_sess(sess, model_spec, num_steps) 76 | metrics_name = '_'.join(restore_from.split('/')) 77 | save_path = os.path.join(model_dir, "metrics_test_{}.json".format(metrics_name)) 78 | save_dict_to_json(metrics, save_path) 79 | -------------------------------------------------------------------------------- /tensorflow/vision/model/input_fn.py: -------------------------------------------------------------------------------- 1 | """Create the input data pipeline using `tf.data`""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def _parse_function(filename, label, size): 7 | """Obtain the image from the filename (for both training and validation). 8 | 9 | The following operations are applied: 10 | - Decode the image from jpeg format 11 | - Convert to float and to range [0, 1] 12 | """ 13 | image_string = tf.read_file(filename) 14 | 15 | # Don't use tf.image.decode_image, or the output shape will be undefined 16 | image_decoded = tf.image.decode_jpeg(image_string, channels=3) 17 | 18 | # This will convert to float values in [0, 1] 19 | image = tf.image.convert_image_dtype(image_decoded, tf.float32) 20 | 21 | resized_image = tf.image.resize_images(image, [size, size]) 22 | 23 | return resized_image, label 24 | 25 | 26 | def train_preprocess(image, label, use_random_flip): 27 | """Image preprocessing for training. 28 | 29 | Apply the following operations: 30 | - Horizontally flip the image with probability 1/2 31 | - Apply random brightness and saturation 32 | """ 33 | if use_random_flip: 34 | image = tf.image.random_flip_left_right(image) 35 | 36 | image = tf.image.random_brightness(image, max_delta=32.0 / 255.0) 37 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 38 | 39 | # Make sure the image is still in [0, 1] 40 | image = tf.clip_by_value(image, 0.0, 1.0) 41 | 42 | return image, label 43 | 44 | 45 | def input_fn(is_training, filenames, labels, params): 46 | """Input function for the SIGNS dataset. 47 | 48 | The filenames have format "{label}_IMG_{id}.jpg". 49 | For instance: "data_dir/2_IMG_4584.jpg". 50 | 51 | Args: 52 | is_training: (bool) whether to use the train or test pipeline. 53 | At training, we shuffle the data and apply random preprocessing 54 | filenames: (list) filenames of the images, as ["data_dir/{label}_IMG_{id}.jpg"...]
55 | labels: (list) corresponding list of labels 56 | params: (Params) contains hyperparameters of the model (ex: `params.num_epochs`) 57 | """ 58 | num_samples = len(filenames) 59 | assert len(filenames) == len(labels), "Filenames and labels should have same length" 60 | 61 | # Create a Dataset serving batches of images and labels 62 | # We don't repeat for multiple epochs because we always train and evaluate for one epoch 63 | parse_fn = lambda f, l: _parse_function(f, l, params.image_size) 64 | train_fn = lambda f, l: train_preprocess(f, l, params.use_random_flip) 65 | 66 | if is_training: 67 | dataset = (tf.data.Dataset.from_tensor_slices((tf.constant(filenames), tf.constant(labels))) 68 | .shuffle(num_samples) # whole dataset into the buffer ensures good shuffling 69 | .map(parse_fn, num_parallel_calls=params.num_parallel_calls) 70 | .map(train_fn, num_parallel_calls=params.num_parallel_calls) 71 | .batch(params.batch_size) 72 | .prefetch(1) # make sure you always have one batch ready to serve 73 | ) 74 | else: 75 | dataset = (tf.data.Dataset.from_tensor_slices((tf.constant(filenames), tf.constant(labels))) 76 | .map(parse_fn) 77 | .batch(params.batch_size) 78 | .prefetch(1) # make sure you always have one batch ready to serve 79 | ) 80 | 81 | # Create reinitializable iterator from dataset 82 | iterator = dataset.make_initializable_iterator() 83 | images, labels = iterator.get_next() 84 | iterator_init_op = iterator.initializer 85 | 86 | inputs = {'images': images, 'labels': labels, 'iterator_init_op': iterator_init_op} 87 | return inputs 88 | -------------------------------------------------------------------------------- /tensorflow/vision/model/model_fn.py: -------------------------------------------------------------------------------- 1 | """Define the model.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def build_model(is_training, inputs, params): 7 | """Compute logits of the model (output distribution) 8 | 9 | Args: 10 | is_training: (bool) whether we are training or not 11 | inputs: (dict) contains the inputs of the graph (features, labels...) 
12 | this can be `tf.placeholder` or outputs of `tf.data` 13 | params: (Params) hyperparameters 14 | 15 | Returns: 16 | output: (tf.Tensor) output of the model 17 | """ 18 | images = inputs['images'] 19 | 20 | assert images.get_shape().as_list() == [None, params.image_size, params.image_size, 3] 21 | 22 | out = images 23 | # Define the number of channels of each convolution 24 | # For each block, we do: 3x3 conv -> batch norm -> relu -> 2x2 maxpool 25 | num_channels = params.num_channels 26 | bn_momentum = params.bn_momentum 27 | channels = [num_channels, num_channels * 2, num_channels * 4, num_channels * 8] 28 | for i, c in enumerate(channels): 29 | with tf.variable_scope('block_{}'.format(i+1)): 30 | out = tf.layers.conv2d(out, c, 3, padding='same') 31 | if params.use_batch_norm: 32 | out = tf.layers.batch_normalization(out, momentum=bn_momentum, training=is_training) 33 | out = tf.nn.relu(out) 34 | out = tf.layers.max_pooling2d(out, 2, 2) 35 | 36 | assert out.get_shape().as_list() == [None, 4, 4, num_channels * 8] 37 | 38 | out = tf.reshape(out, [-1, 4 * 4 * num_channels * 8]) 39 | with tf.variable_scope('fc_1'): 40 | out = tf.layers.dense(out, num_channels * 8) 41 | if params.use_batch_norm: 42 | out = tf.layers.batch_normalization(out, momentum=bn_momentum, training=is_training) 43 | out = tf.nn.relu(out) 44 | with tf.variable_scope('fc_2'): 45 | logits = tf.layers.dense(out, params.num_labels) 46 | 47 | return logits 48 | 49 | 50 | def model_fn(mode, inputs, params, reuse=False): 51 | """Model function defining the graph operations. 52 | 53 | Args: 54 | mode: (string) can be 'train' or 'eval' 55 | inputs: (dict) contains the inputs of the graph (features, labels...) 56 | this can be `tf.placeholder` or outputs of `tf.data` 57 | params: (Params) contains hyperparameters of the model (ex: `params.learning_rate`) 58 | reuse: (bool) whether to reuse the weights 59 | 60 | Returns: 61 | model_spec: (dict) contains the graph operations or nodes needed for training / evaluation 62 | """ 63 | is_training = (mode == 'train') 64 | labels = inputs['labels'] 65 | labels = tf.cast(labels, tf.int64) 66 | 67 | # ----------------------------------------------------------- 68 | # MODEL: define the layers of the model 69 | with tf.variable_scope('model', reuse=reuse): 70 | # Compute the output distribution of the model and the predictions 71 | logits = build_model(is_training, inputs, params) 72 | predictions = tf.argmax(logits, 1) 73 | 74 | # Define loss and accuracy 75 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 76 | accuracy = tf.reduce_mean(tf.cast(tf.equal(labels, predictions), tf.float32)) 77 | 78 | # Define training step that minimizes the loss with the Adam optimizer 79 | if is_training: 80 | optimizer = tf.train.AdamOptimizer(params.learning_rate) 81 | global_step = tf.train.get_or_create_global_step() 82 | if params.use_batch_norm: 83 | # Add a dependency to update the moving mean and variance for batch normalization 84 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 85 | train_op = optimizer.minimize(loss, global_step=global_step) 86 | else: 87 | train_op = optimizer.minimize(loss, global_step=global_step) 88 | 89 | 90 | # ----------------------------------------------------------- 91 | # METRICS AND SUMMARIES 92 | # Metrics for evaluation using tf.metrics (average over whole dataset) 93 | with tf.variable_scope("metrics"): 94 | metrics = { 95 | 'accuracy': tf.metrics.accuracy(labels=labels, predictions=tf.argmax(logits, 1)), 96 | 
'loss': tf.metrics.mean(loss) 97 | } 98 | 99 | # Group the update ops for the tf.metrics 100 | update_metrics_op = tf.group(*[op for _, op in metrics.values()]) 101 | 102 | # Get the op to reset the local variables used in tf.metrics 103 | metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metrics") 104 | metrics_init_op = tf.variables_initializer(metric_variables) 105 | 106 | # Summaries for training 107 | tf.summary.scalar('loss', loss) 108 | tf.summary.scalar('accuracy', accuracy) 109 | tf.summary.image('train_image', inputs['images']) 110 | 111 | #TODO: if mode == 'eval': ? 112 | # Add incorrectly labeled images 113 | mask = tf.not_equal(labels, predictions) 114 | 115 | # Add a different summary to know how they were misclassified 116 | for label in range(0, params.num_labels): 117 | mask_label = tf.logical_and(mask, tf.equal(predictions, label)) 118 | incorrect_image_label = tf.boolean_mask(inputs['images'], mask_label) 119 | tf.summary.image('incorrectly_labeled_{}'.format(label), incorrect_image_label) 120 | 121 | # ----------------------------------------------------------- 122 | # MODEL SPECIFICATION 123 | # Create the model specification and return it 124 | # It contains nodes or operations in the graph that will be used for training and evaluation 125 | model_spec = inputs 126 | model_spec['variable_init_op'] = tf.global_variables_initializer() 127 | model_spec["predictions"] = predictions 128 | model_spec['loss'] = loss 129 | model_spec['accuracy'] = accuracy 130 | model_spec['metrics_init_op'] = metrics_init_op 131 | model_spec['metrics'] = metrics 132 | model_spec['update_metrics'] = update_metrics_op 133 | model_spec['summary_op'] = tf.summary.merge_all() 134 | 135 | if is_training: 136 | model_spec['train_op'] = train_op 137 | 138 | return model_spec 139 | -------------------------------------------------------------------------------- /tensorflow/vision/model/training.py: -------------------------------------------------------------------------------- 1 | """Tensorflow utility functions for training""" 2 | 3 | import logging 4 | import os 5 | 6 | from tqdm import trange 7 | import tensorflow as tf 8 | 9 | from model.utils import save_dict_to_json 10 | from model.evaluation import evaluate_sess 11 | 12 | 13 | def train_sess(sess, model_spec, num_steps, writer, params): 14 | """Train the model on `num_steps` batches 15 | 16 | Args: 17 | sess: (tf.Session) current session 18 | model_spec: (dict) contains the graph operations or nodes needed for training 19 | num_steps: (int) train for this number of batches 20 | writer: (tf.summary.FileWriter) writer for summaries 21 | params: (Params) hyperparameters 22 | """ 23 | # Get relevant graph operations or nodes needed for training 24 | loss = model_spec['loss'] 25 | train_op = model_spec['train_op'] 26 | update_metrics = model_spec['update_metrics'] 27 | metrics = model_spec['metrics'] 28 | summary_op = model_spec['summary_op'] 29 | global_step = tf.train.get_global_step() 30 | 31 | # Load the training dataset into the pipeline and initialize the metrics local variables 32 | sess.run(model_spec['iterator_init_op']) 33 | sess.run(model_spec['metrics_init_op']) 34 | 35 | # Use tqdm for progress bar 36 | t = trange(num_steps) 37 | for i in t: 38 | # Evaluate summaries for tensorboard only once in a while 39 | if i % params.save_summary_steps == 0: 40 | # Perform a mini-batch update 41 | _, _, loss_val, summ, global_step_val = sess.run([train_op, update_metrics, loss, 42 | summary_op, global_step]) 43 | # Write 
summaries for tensorboard 44 | writer.add_summary(summ, global_step_val) 45 | else: 46 | _, _, loss_val = sess.run([train_op, update_metrics, loss]) 47 | # Log the loss in the tqdm progress bar 48 | t.set_postfix(loss='{:05.3f}'.format(loss_val)) 49 | 50 | 51 | metrics_values = {k: v[0] for k, v in metrics.items()} 52 | metrics_val = sess.run(metrics_values) 53 | metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_val.items()) 54 | logging.info("- Train metrics: " + metrics_string) 55 | 56 | 57 | def train_and_evaluate(train_model_spec, eval_model_spec, model_dir, params, restore_from=None): 58 | """Train the model and evaluate every epoch. 59 | 60 | Args: 61 | train_model_spec: (dict) contains the graph operations or nodes needed for training 62 | eval_model_spec: (dict) contains the graph operations or nodes needed for evaluation 63 | model_dir: (string) directory containing config, weights and log 64 | params: (Params) contains hyperparameters of the model. 65 | Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps 66 | restore_from: (string) directory or file containing weights to restore the graph 67 | """ 68 | # Initialize tf.Saver instances to save weights during training 69 | last_saver = tf.train.Saver() # will keep last 5 epochs 70 | best_saver = tf.train.Saver(max_to_keep=1) # only keep 1 best checkpoint (best on eval) 71 | begin_at_epoch = 0 72 | 73 | with tf.Session() as sess: 74 | # Initialize model variables 75 | sess.run(train_model_spec['variable_init_op']) 76 | 77 | # Reload weights from directory if specified 78 | if restore_from is not None: 79 | logging.info("Restoring parameters from {}".format(restore_from)) 80 | if os.path.isdir(restore_from): 81 | restore_from = tf.train.latest_checkpoint(restore_from) 82 | begin_at_epoch = int(restore_from.split('-')[-1]) 83 | last_saver.restore(sess, restore_from) 84 | 85 | # For tensorboard (takes care of writing summaries to files) 86 | train_writer = tf.summary.FileWriter(os.path.join(model_dir, 'train_summaries'), sess.graph) 87 | eval_writer = tf.summary.FileWriter(os.path.join(model_dir, 'eval_summaries'), sess.graph) 88 | 89 | best_eval_acc = 0.0 90 | for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs): 91 | # Run one epoch 92 | logging.info("Epoch {}/{}".format(epoch + 1, begin_at_epoch + params.num_epochs)) 93 | # Compute number of batches in one epoch (one full pass over the training set) 94 | num_steps = (params.train_size + params.batch_size - 1) // params.batch_size 95 | train_sess(sess, train_model_spec, num_steps, train_writer, params) 96 | 97 | # Save weights 98 | last_save_path = os.path.join(model_dir, 'last_weights', 'after-epoch') 99 | last_saver.save(sess, last_save_path, global_step=epoch + 1) 100 | 101 | # Evaluate for one epoch on validation set 102 | num_steps = (params.eval_size + params.batch_size - 1) // params.batch_size 103 | metrics = evaluate_sess(sess, eval_model_spec, num_steps, eval_writer) 104 | 105 | # If best_eval, best_save_path 106 | eval_acc = metrics['accuracy'] 107 | if eval_acc >= best_eval_acc: 108 | # Store new best accuracy 109 | best_eval_acc = eval_acc 110 | # Save weights 111 | best_save_path = os.path.join(model_dir, 'best_weights', 'after-epoch') 112 | best_save_path = best_saver.save(sess, best_save_path, global_step=epoch + 1) 113 | logging.info("- Found new best accuracy, saving in {}".format(best_save_path)) 114 | # Save best eval metrics in a json file in the model directory 115 | best_json_path = 
os.path.join(model_dir, "metrics_eval_best_weights.json") 116 | save_dict_to_json(metrics, best_json_path) 117 | 118 | # Save latest eval metrics in a json file in the model directory 119 | last_json_path = os.path.join(model_dir, "metrics_eval_last_weights.json") 120 | save_dict_to_json(metrics, last_json_path) 121 | -------------------------------------------------------------------------------- /tensorflow/vision/model/utils.py: -------------------------------------------------------------------------------- 1 | """General utility functions""" 2 | 3 | import json 4 | import logging 5 | 6 | 7 | class Params(): 8 | """Class that loads hyperparameters from a json file. 9 | 10 | Example: 11 | ``` 12 | params = Params(json_path) 13 | print(params.learning_rate) 14 | params.learning_rate = 0.5 # change the value of learning_rate in params 15 | ``` 16 | """ 17 | 18 | def __init__(self, json_path): 19 | self.update(json_path) 20 | 21 | def save(self, json_path): 22 | """Saves parameters to json file""" 23 | with open(json_path, 'w') as f: 24 | json.dump(self.__dict__, f, indent=4) 25 | 26 | def update(self, json_path): 27 | """Loads parameters from json file""" 28 | with open(json_path) as f: 29 | params = json.load(f) 30 | self.__dict__.update(params) 31 | 32 | @property 33 | def dict(self): 34 | """Gives dict-like access to Params instance by `params.dict['learning_rate']`""" 35 | return self.__dict__ 36 | 37 | 38 | def set_logger(log_path): 39 | """Sets the logger to log info in terminal and file `log_path`. 40 | 41 | In general, it is useful to have a logger so that every output to the terminal is saved 42 | in a permanent file. Here we save it to `model_dir/train.log`. 43 | 44 | Example: 45 | ``` 46 | logging.info("Starting training...") 47 | ``` 48 | 49 | Args: 50 | log_path: (string) where to log 51 | """ 52 | logger = logging.getLogger() 53 | logger.setLevel(logging.INFO) 54 | 55 | if not logger.handlers: 56 | # Logging to a file 57 | file_handler = logging.FileHandler(log_path) 58 | file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')) 59 | logger.addHandler(file_handler) 60 | 61 | # Logging to console 62 | stream_handler = logging.StreamHandler() 63 | stream_handler.setFormatter(logging.Formatter('%(message)s')) 64 | logger.addHandler(stream_handler) 65 | 66 | 67 | def save_dict_to_json(d, json_path): 68 | """Saves dict of floats in json file 69 | 70 | Args: 71 | d: (dict) of float-castable values (np.float, int, float, etc.) 
72 | json_path: (string) path to json file 73 | """ 74 | with open(json_path, 'w') as f: 75 | # We need to convert the values to float for json (it doesn't accept np.array or np.float) 76 | d = {k: float(v) for k, v in d.items()} 77 | json.dump(d, f, indent=4) 78 | -------------------------------------------------------------------------------- /tensorflow/vision/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | Pillow 3 | tensorflow==1.15.0 4 | tabulate 5 | tqdm 6 | -------------------------------------------------------------------------------- /tensorflow/vision/search_hyperparams.py: -------------------------------------------------------------------------------- 1 | """Perform hyperparameter search""" 2 | 3 | import argparse 4 | import os 5 | from subprocess import check_call 6 | import sys 7 | 8 | from model.utils import Params 9 | 10 | 11 | PYTHON = sys.executable 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--parent_dir', default='experiments/learning_rate', 14 | help="Directory containing params.json") 15 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 16 | help="Directory containing the dataset") 17 | 18 | 19 | def launch_training_job(parent_dir, data_dir, job_name, params): 20 | """Launch training of the model with a set of hyperparameters in parent_dir/job_name 21 | 22 | Args: 23 | parent_dir: (string) directory containing config, weights and log 24 | data_dir: (string) directory containing the dataset 25 | params: (Params) hyperparameters of the job 26 | """ 27 | # Create a new folder in parent_dir with the unique name "job_name" 28 | model_dir = os.path.join(parent_dir, job_name) 29 | if not os.path.exists(model_dir): 30 | os.makedirs(model_dir) 31 | 32 | # Write parameters in json file 33 | json_path = os.path.join(model_dir, 'params.json') 34 | params.save(json_path) 35 | 36 | # Launch training with this config 37 | cmd = "{python} train.py --model_dir {model_dir} --data_dir {data_dir}".format(python=PYTHON, 38 | model_dir=model_dir, data_dir=data_dir) 39 | print(cmd) 40 | check_call(cmd, shell=True) 41 | 42 | 43 | if __name__ == "__main__": 44 | # Load the "reference" parameters from parent_dir json file 45 | args = parser.parse_args() 46 | json_path = os.path.join(args.parent_dir, 'params.json') 47 | assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path) 48 | params = Params(json_path) 49 | 50 | # Perform hyperparameter search over one parameter 51 | learning_rates = [1e-4, 1e-3, 1e-2] 52 | 53 | for learning_rate in learning_rates: 54 | # Modify the relevant parameter in params 55 | params.learning_rate = learning_rate 56 | 57 | # Launch job (name has to be unique) 58 | job_name = "learning_rate_{}".format(learning_rate) 59 | launch_training_job(args.parent_dir, args.data_dir, job_name, params) 60 | -------------------------------------------------------------------------------- /tensorflow/vision/synthesize_results.py: -------------------------------------------------------------------------------- 1 | """Aggregates results from the metrics_eval_best_weights.json in a parent folder""" 2 | 3 | import argparse 4 | import json 5 | import os 6 | 7 | from tabulate import tabulate 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--parent_dir', default='experiments', 12 | help='Directory containing results of experiments') 13 | 14 | 15 | def aggregate_metrics(parent_dir, metrics): 16 | """Aggregate the metrics of all experiments in
folder `parent_dir`. 17 | 18 | Assumes that `parent_dir` contains multiple experiments, with their results stored in 19 | `parent_dir/subdir/metrics_eval_best_weights.json` 20 | 21 | Args: 22 | parent_dir: (string) path to the directory containing the experiments' results 23 | metrics: (dict) subdir -> {'accuracy': ..., ...} 24 | """ 25 | # Get the metrics for the folder if it has results from an experiment 26 | metrics_file = os.path.join(parent_dir, 'metrics_eval_best_weights.json') 27 | if os.path.isfile(metrics_file): 28 | with open(metrics_file, 'r') as f: 29 | metrics[parent_dir] = json.load(f) 30 | 31 | # Check every subdirectory of parent_dir 32 | for subdir in os.listdir(parent_dir): 33 | if not os.path.isdir(os.path.join(parent_dir, subdir)): 34 | continue 35 | else: 36 | aggregate_metrics(os.path.join(parent_dir, subdir), metrics) 37 | 38 | 39 | def metrics_to_table(metrics): 40 | # Get the headers from the first subdir. Assumes everything has the same metrics 41 | headers = metrics[list(metrics.keys())[0]].keys() 42 | table = [[subdir] + [values[h] for h in headers] for subdir, values in metrics.items()] 43 | res = tabulate(table, headers, tablefmt='pipe') 44 | 45 | return res 46 | 47 | 48 | if __name__ == "__main__": 49 | args = parser.parse_args() 50 | 51 | # Aggregate metrics from args.parent_dir directory 52 | metrics = dict() 53 | aggregate_metrics(args.parent_dir, metrics) 54 | table = metrics_to_table(metrics) 55 | 56 | # Display the table to terminal 57 | print(table) 58 | 59 | # Save results in parent_dir/results.md 60 | save_file = os.path.join(args.parent_dir, "results.md") 61 | with open(save_file, 'w') as f: 62 | f.write(table) 63 | -------------------------------------------------------------------------------- /tensorflow/vision/train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import random 7 | 8 | import tensorflow as tf 9 | 10 | from model.input_fn import input_fn 11 | from model.utils import Params 12 | from model.utils import set_logger 13 | from model.utils import save_dict_to_json 14 | from model.model_fn import model_fn 15 | from model.training import train_and_evaluate 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--model_dir', default='experiments/test', 20 | help="Experiment directory containing params.json") 21 | parser.add_argument('--data_dir', default='data/64x64_SIGNS', 22 | help="Directory containing the dataset") 23 | parser.add_argument('--restore_from', default=None, 24 | help="Optional, directory or file containing weights to reload before training") 25 | 26 | 27 | if __name__ == '__main__': 28 | # Set the random seed for the whole graph for reproducible experiments 29 | tf.set_random_seed(230) 30 | 31 | # Load the parameters from json file 32 | args = parser.parse_args() 33 | json_path = os.path.join(args.model_dir, 'params.json') 34 | assert os.path.isfile( 35 | json_path), "No json configuration file found at {}".format(json_path) 36 | params = Params(json_path) 37 | 38 | # Check that we are not overwriting some previous experiment 39 | # Comment these lines if you are developing your model and don't care about overwriting 40 | model_dir_has_best_weights = os.path.isdir( 41 | os.path.join(args.model_dir, "best_weights")) 42 | overwriting = model_dir_has_best_weights and args.restore_from is None 43 | assert not overwriting, "Weights found in model_dir, aborting to avoid overwrite" 44 | 45 | # Set the
logger 46 | set_logger(os.path.join(args.model_dir, 'train.log')) 47 | 48 | # Create the input data pipeline 49 | logging.info("Creating the datasets...") 50 | data_dir = args.data_dir 51 | train_data_dir = os.path.join(data_dir, "train_signs") 52 | dev_data_dir = os.path.join(data_dir, "dev_signs") 53 | 54 | # Get the filenames from the train and dev sets 55 | train_filenames = [os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir) 56 | if f.endswith('.jpg')] 57 | eval_filenames = [os.path.join(dev_data_dir, f) for f in os.listdir(dev_data_dir) 58 | if f.endswith('.jpg')] 59 | 60 | # Labels will be between 0 and 5 included (6 classes in total) 61 | train_labels = [int(f.split('/')[-1][0]) for f in train_filenames] 62 | eval_labels = [int(f.split('/')[-1][0]) for f in eval_filenames] 63 | 64 | # Specify the sizes of the dataset we train on and evaluate on 65 | params.train_size = len(train_filenames) 66 | params.eval_size = len(eval_filenames) 67 | 68 | # Create the two iterators over the two datasets 69 | train_inputs = input_fn(True, train_filenames, train_labels, params) 70 | eval_inputs = input_fn(False, eval_filenames, eval_labels, params) 71 | 72 | # Define the model 73 | logging.info("Creating the model...") 74 | train_model_spec = model_fn('train', train_inputs, params) 75 | eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True) 76 | 77 | # Train the model 78 | logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) 79 | train_and_evaluate(train_model_spec, eval_model_spec, 80 | args.model_dir, params, args.restore_from) 81 | --------------------------------------------------------------------------------
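
Taken together, these scripts form a small experiment workflow: `train.py` trains on `train_signs`, evaluates on `dev_signs` after every epoch, and keeps the best checkpoint in `model_dir/best_weights`; `search_hyperparams.py` launches one training job per hyperparameter value; `synthesize_results.py` aggregates every `metrics_eval_best_weights.json` into a table; and `evaluate.py` scores a chosen checkpoint on `test_signs`. The sketch below is a hypothetical driver (not part of the repository) showing one way to chain the scripts from Python with `subprocess`; the directory names are simply the defaults used above and may need to be adapted.

```python
# Hypothetical end-to-end driver for tensorflow/vision (illustrative only).
# Assumes the SIGNS dataset has been built under data/64x64_SIGNS and that
# experiments/base_model/params.json exists.
import sys
from subprocess import check_call

PYTHON = sys.executable
MODEL_DIR = "experiments/base_model"   # experiment directory containing params.json
DATA_DIR = "data/64x64_SIGNS"          # contains train_signs/, dev_signs/ and test_signs/

# 1. Train, evaluating on dev_signs every epoch; the best checkpoint is saved
#    under MODEL_DIR/best_weights by train_and_evaluate().
check_call([PYTHON, "train.py", "--model_dir", MODEL_DIR, "--data_dir", DATA_DIR])

# 2. Optionally search over learning rates; one sub-directory per job is created
#    under experiments/learning_rate.
check_call([PYTHON, "search_hyperparams.py",
            "--parent_dir", "experiments/learning_rate", "--data_dir", DATA_DIR])

# 3. Aggregate metrics_eval_best_weights.json from all experiments into results.md.
check_call([PYTHON, "synthesize_results.py", "--parent_dir", "experiments"])

# 4. Evaluate the best checkpoint on the held-out test_signs split.
check_call([PYTHON, "evaluate.py", "--model_dir", MODEL_DIR,
            "--data_dir", DATA_DIR, "--restore_from", "best_weights"])
```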