├── cloudml-gpu.yaml
├── cloudml-4gpu.yaml
├── cloudml-gpu-distributed.yaml
├── __init__.py
├── models.py
├── CONTRIBUTING.md
├── feature_extractor
    ├── feature_extractor_test.py
    ├── README.md
    ├── feature_extractor.py
    └── extract_tfrecords_main.py
├── convert_prediction_from_json_to_csv.py
├── model_utils.py
├── losses.py
├── video_level_models.py
├── mean_average_precision_calculator.py
├── export_model.py
├── utils.py
├── inference.py
├── eval_util.py
├── frame_level_models.py
├── average_precision_calculator.py
├── readers.py
├── LICENSE
├── eval.py
├── README.md
└── train.py


/cloudml-gpu.yaml:
--------------------------------------------------------------------------------
1 | trainingInput:
2 |   scaleTier: CUSTOM
3 |   masterType: standard_gpu
4 |   runtimeVersion: "1.0"
5 | 


--------------------------------------------------------------------------------
/cloudml-4gpu.yaml:
--------------------------------------------------------------------------------
1 | trainingInput:
2 |   scaleTier: CUSTOM
3 |   masterType: complex_model_m_gpu
4 |   runtimeVersion: "1.0"
5 | 


--------------------------------------------------------------------------------
/cloudml-gpu-distributed.yaml:
--------------------------------------------------------------------------------
1 | trainingInput:
2 |   runtimeVersion: "1.0"
3 |   scaleTier: CUSTOM
4 |   masterType: standard_gpu
5 |   workerCount: 2
6 |   workerType: standard_gpu
7 |   parameterServerCount: 2
8 |   parameterServerType: standard
9 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2016 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2016 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Contains the base class for models."""
16 | 
17 | class BaseModel(object):
18 |   """Inherit from this class when implementing new models."""
19 | 
20 |   def create_model(self, unused_model_input, **unused_params):
21 |     raise NotImplementedError()
22 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to contribute
 2 | 
 3 | We are accepting patches and contributions to this project. To set expectations,
 4 | this project is primarily intended to be a flexible starting point for
 5 | researchers working with the YouTube-8M dataset. As such, we would like to keep
 6 | it simple. We are more likely to accept small bug fixes and optimizations, and
 7 | less likely to accept patches which add significant complexity. For the latter
 8 | type of contribution, we recommend creating a Github fork of the project
 9 | instead.
10 | 
11 | If you would like to contribute, there are a few small guidelines you need to
12 | follow.
13 | 
14 | ## Contributor License Agreement
15 | 
16 | Contributions to any Google project must be accompanied by a Contributor License
17 | Agreement. This is necessary because you own the copyright to your changes, even
18 | after your contribution becomes part of this project. So this agreement simply
19 | gives us permission to use and redistribute your contributions as part of the
20 | project. Head over to <https://cla.developers.google.com/> to see your current
21 | agreements on file or to sign a new one.
22 | 
23 | You generally only need to submit a CLA once, so if you've already submitted one
24 | (even if it was for a different project), you probably don't need to do it
25 | again.
26 | 
27 | ## Code reviews
28 | 
29 | All submissions, including submissions by project members, require review. We
30 | use GitHub pull requests for this purpose. Consult [GitHub Help] for more
31 | information on using pull requests.
32 | 
33 | [GitHub Help]: https://help.github.com/articles/about-pull-requests/
34 | 


--------------------------------------------------------------------------------
/feature_extractor/feature_extractor_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Tests for feature_extractor."""
15 | 
16 | import pickle
17 | import json
18 | import os
19 | import feature_extractor
20 | import numpy
21 | from PIL import Image
22 | from tensorflow.python.platform import googletest
23 | 
24 | 
25 | def _FilePath(filename):
26 |   return os.path.join('testdata', filename)
27 | 
28 | 
29 | def _MeanElementWiseDifference(a, b):
30 |   """Calculates element-wise percent difference between two numpy matrices."""
31 |   difference = numpy.abs(a - b)
32 |   denominator = numpy.maximum(numpy.abs(a), numpy.abs(b))
33 | 
34 |   # We dont care if one is 0 and another is 0.01
35 |   return (difference / (0.01 + denominator)).mean()
36 | 
37 | 
38 | class FeatureExtractorTest(googletest.TestCase):
39 | 
40 |   def setUp(self):
41 |     self._extractor = feature_extractor.YouTube8MFeatureExtractor()
42 | 
43 |   def testPCAOnFeatureVector(self):
44 |     sports_1m_test_data = cPickle.load(open(_FilePath('sports1m_frame.pkl')))
45 |     actual_pca = self._extractor.apply_pca(sports_1m_test_data['original'])
46 |     expected_pca = sports_1m_test_data['pca']
47 |     self.assertLess(_MeanElementWiseDifference(actual_pca, expected_pca), 1e-5)
48 | 
49 | 
50 | if __name__ == '__main__':
51 |   googletest.main()
52 | 


--------------------------------------------------------------------------------
/convert_prediction_from_json_to_csv.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Utility to convert the output of batch prediction into a CSV submission.
 16 | 
 17 | It converts the JSON files created by the command
 18 | 'gcloud beta ml jobs submit prediction' into a CSV file ready for submission.
 19 | """
 20 | 
 21 | import json
 22 | import tensorflow as tf
 23 | 
 24 | from builtins import range
 25 | from tensorflow import app
 26 | from tensorflow import flags
 27 | from tensorflow import gfile
 28 | from tensorflow import logging
 29 | 
 30 | 
 31 | FLAGS = flags.FLAGS
 32 | 
 33 | if __name__ == '__main__':
 34 | 
 35 |   flags.DEFINE_string(
 36 |       "json_prediction_files_pattern", None,
 37 |       "Pattern specifying the list of JSON files that the command "
 38 |       "'gcloud beta ml jobs submit prediction' outputs. These files are "
 39 |       "located in the output path of the prediction command and are prefixed "
 40 |       "with 'prediction.results'.")
 41 |   flags.DEFINE_string(
 42 |       "csv_output_file", None,
 43 |       "The file to save the predictions converted to the CSV format.")
 44 | 
 45 | 
 46 | def get_csv_header():
 47 |   return "VideoId,LabelConfidencePairs\n"
 48 | 
 49 | def to_csv_row(json_data):
 50 | 
 51 |   video_id = json_data["video_id"]
 52 | 
 53 |   class_indexes = json_data["class_indexes"]
 54 |   predictions = json_data["predictions"]
 55 | 
 56 |   if isinstance(video_id, list):
 57 |     video_id = video_id[0]
 58 |     class_indexes = class_indexes[0]
 59 |     predictions = predictions[0]
 60 | 
 61 |   if len(class_indexes) != len(predictions):
 62 |     raise ValueError(
 63 |         "The number of indexes (%s) and predictions (%s) must be equal." 
 64 |         % (len(class_indexes), len(predictions)))
 65 | 
 66 |   return (video_id.decode('utf-8') + "," + " ".join("%i %f" % 
 67 |       (class_indexes[i], predictions[i]) 
 68 |       for i in range(len(class_indexes))) + "\n")
 69 | 
 70 | def main(unused_argv):
 71 |   logging.set_verbosity(tf.logging.INFO)
 72 | 
 73 |   if not FLAGS.json_prediction_files_pattern:
 74 |     raise ValueError(
 75 |         "The flag --json_prediction_files_pattern must be specified.")
 76 | 
 77 |   if not FLAGS.csv_output_file:
 78 |     raise ValueError("The flag --csv_output_file must be specified.")
 79 | 
 80 |   logging.info("Looking for prediction files with pattern: %s", 
 81 |                FLAGS.json_prediction_files_pattern)
 82 | 
 83 |   file_paths = gfile.Glob(FLAGS.json_prediction_files_pattern)  
 84 |   logging.info("Found files: %s", file_paths)
 85 | 
 86 |   logging.info("Writing submission file to: %s", FLAGS.csv_output_file)
 87 |   with gfile.Open(FLAGS.csv_output_file, "w+") as output_file:
 88 |     output_file.write(get_csv_header())
 89 | 
 90 |     for file_path in file_paths:
 91 |       logging.info("processing file: %s", file_path)
 92 | 
 93 |       with gfile.Open(file_path) as input_file:
 94 | 
 95 |         for line in input_file: 
 96 |           json_data = json.loads(line)
 97 |           output_file.write(to_csv_row(json_data))
 98 | 
 99 |     output_file.flush()
100 |   logging.info("done")
101 | 
102 | if __name__ == "__main__":
103 |   app.run()
104 | 


--------------------------------------------------------------------------------
/model_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2016 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Contains a collection of util functions for model construction.
16 | """
17 | import numpy
18 | import tensorflow as tf
19 | from tensorflow import logging
20 | from tensorflow import flags
21 | import tensorflow.contrib.slim as slim
22 | 
23 | def SampleRandomSequence(model_input, num_frames, num_samples):
24 |   """Samples a random sequence of frames of size num_samples.
25 | 
26 |   Args:
27 |     model_input: A tensor of size batch_size x max_frames x feature_size
28 |     num_frames: A tensor of size batch_size x 1
29 |     num_samples: A scalar
30 | 
31 |   Returns:
32 |     `model_input`: A tensor of size batch_size x num_samples x feature_size
33 |   """
34 | 
35 |   batch_size = tf.shape(model_input)[0]
36 |   frame_index_offset = tf.tile(
37 |       tf.expand_dims(tf.range(num_samples), 0), [batch_size, 1])
38 |   max_start_frame_index = tf.maximum(num_frames - num_samples, 0)
39 |   start_frame_index = tf.cast(
40 |       tf.multiply(
41 |           tf.random_uniform([batch_size, 1]),
42 |           tf.cast(max_start_frame_index + 1, tf.float32)), tf.int32)
43 |   frame_index = tf.minimum(start_frame_index + frame_index_offset,
44 |                            tf.cast(num_frames - 1, tf.int32))
45 |   batch_index = tf.tile(
46 |       tf.expand_dims(tf.range(batch_size), 1), [1, num_samples])
47 |   index = tf.stack([batch_index, frame_index], 2)
48 |   return tf.gather_nd(model_input, index)
49 | 
50 | 
51 | def SampleRandomFrames(model_input, num_frames, num_samples):
52 |   """Samples a random set of frames of size num_samples.
53 | 
54 |   Args:
55 |     model_input: A tensor of size batch_size x max_frames x feature_size
56 |     num_frames: A tensor of size batch_size x 1
57 |     num_samples: A scalar
58 | 
59 |   Returns:
60 |     `model_input`: A tensor of size batch_size x num_samples x feature_size
61 |   """
62 |   batch_size = tf.shape(model_input)[0]
63 |   frame_index = tf.cast(
64 |       tf.multiply(
65 |           tf.random_uniform([batch_size, num_samples]),
66 |           tf.tile(tf.cast(num_frames, tf.float32), [1, num_samples])), tf.int32)
67 |   batch_index = tf.tile(
68 |       tf.expand_dims(tf.range(batch_size), 1), [1, num_samples])
69 |   index = tf.stack([batch_index, frame_index], 2)
70 |   return tf.gather_nd(model_input, index)
71 | 
72 | def FramePooling(frames, method, **unused_params):
73 |   """Pools over the frames of a video.
74 | 
75 |   Args:
76 |     frames: A tensor with shape [batch_size, num_frames, feature_size].
77 |     method: "average", "max", "attention", or "none".
78 |   Returns:
79 |     A tensor with shape [batch_size, feature_size] for average, max, or
80 |     attention pooling. A tensor with shape [batch_size*num_frames, feature_size]
81 |     for none pooling.
82 | 
83 |   Raises:
84 |     ValueError: if method is other than "average", "max", "attention", or
85 |     "none".
86 |   """
87 |   if method == "average":
88 |     return tf.reduce_mean(frames, 1)
89 |   elif method == "max":
90 |     return tf.reduce_max(frames, 1)
91 |   elif method == "none":
92 |     feature_size = frames.shape_as_list()[2]
93 |     return tf.reshape(frames, [-1, feature_size])
94 |   else:
95 |     raise ValueError("Unrecognized pooling method: %s" % method)
96 | 


--------------------------------------------------------------------------------
/feature_extractor/README.md:
--------------------------------------------------------------------------------
 1 | # YouTube8M Feature Extractor
 2 | This directory contains binary and library code that can extract YouTube8M
 3 | features from images and videos.
 4 | The code requires the Inception TensorFlow model ([tutorial](https://www.tensorflow.org/tutorials/image_recognition)) and our PCA matrix, as
 5 | outlined in Section 3.3 of our [paper](https://arxiv.org/abs/1609.08675). The
 6 | first time you use our code, it will **automatically** download the inception
 7 | model (75 Megabytes, tensorflow [GraphDef proto](https://www.tensorflow.org/api_docs/python/tf/GraphDef),
 8 | [download link](http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz))
 9 | and the PCA matrix (25 Megabytes, Numpy arrays,
10 | [download link](http://data.yt8m.org/yt8m_pca.tgz)).
11 | 
12 | ## Usage
13 | 
14 | There are two ways to use this code:
15 | 
16 |  1. Binary `extract_tfrecords_main.py` processes a CSV file of videos (and their
17 |     labels) and outputs `tfrecord` file. Files created with this binary match
18 |     the schema of YouTube-8M dataset files, and are therefore are compatible
19 |     with our training starter code. You can also use the file for inference
20 |     using your models that are pre-trained on YouTube-8M.
21 |  1. Library `feature_extractor.py` which can extract features from images.
22 | 
23 | 
24 | ### Using the Binary to create `tfrecords` from videos
25 | 
26 | You can use binary `extract_tfrecords_main.py` to create `tfrecord` files.
27 | However, this binary assumes that you have OpenCV properly installed (see end
28 | of subsection). Assume that you have two videos `/path/to/vid1` and
29 | `/path/to/vid2`, respectively, with multi-integer labels of `(52, 3, 10)` and
30 | `(7, 67)`. To create `tfrecord` containing features and labels for those videos,
31 | you must first create a CSV file (e.g. on `/path/to/vid_dataset.csv`) with
32 | contents:
33 | 
34 |     /path/to/vid1,52;3;10
35 |     /path/to/vid2,7;67
36 | 
37 | Note that the CSV is comma-separated but the label-field is semi-colon separated
38 | to allow for multiple labels per video.
39 | 
40 | Then, you can create the `tfrecord` by calling the binary:
41 | 
42 |     python extract_tfrecords_main.py --input /path/to/vid_dataset.csv \
43 |         --output_tfrecords_file /path/to/output.tfrecord
44 | 
45 | Now, you can use the output file for training and/or inference using our starter
46 | code.
47 | 
48 | `extract_tfrecords_main.py` requires OpenCV python bindings to be
49 | installed and linked with ffmpeg. In other words, running this command should
50 | print `True`:
51 | 
52 |     python -c 'import cv2; print cv2.VideoCapture().open("/path/to/some/video.mp4")'
53 | 
54 | 
55 | ### Using the library to extract features from images
56 | 
57 | To extract our features from an image file `cropped_panda.jpg`, you can use
58 | this python code:
59 | 
60 | ```python
61 | from PIL import Image
62 | import numpy
63 | 
64 | # Instantiate extractor. Slow if called first time on your machine, as it
65 | # needs to download 100 MB.
66 | extractor = YouTube8MFeatureExtractor()
67 | 
68 | image_file = os.path.join(extractor._model_dir, 'cropped_panda.jpg')
69 | 
70 | im = numpy.array(Image.open(image_file))
71 | features = extractor.extract_rgb_frame_features(im)
72 | ```
73 | 
74 | The constructor `extractor = YouTube8MFeatureExtractor()` will create a
75 | directory `~/yt8m/`, if it does not exist, and will download and untar the two
76 | model files (inception and PCA matrix). If you prefer, you can point our
77 | extractor to another directory as:
78 | 
79 | ```python
80 | extractor = YouTube8MFeatureExtractor(model_dir="/path/to/yt8m_files")
81 | ```
82 | 
83 | You can also pre-populate your custom `"/path/to/yt8m_files"` by manually
84 | downloading (e.g. using `wget`) the URLs and un-tarring them, for example:
85 | 
86 | ```bash
87 | mkdir -p /path/to/yt8m_files
88 | cd /path/to/yt8m_files
89 | 
90 | wget http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
91 | wget http://data.yt8m.org/yt8m_pca.tgz
92 | 
93 | tar zxvf inception-2015-12-05.tgz
94 | tar zxvf yt8m_pca.tgz
95 | ```
96 | 


--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2016 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Provides definitions for non-regularized training or test losses."""
16 | 
17 | import tensorflow as tf
18 | 
19 | 
20 | class BaseLoss(object):
21 |   """Inherit from this class when implementing new losses."""
22 | 
23 |   def calculate_loss(self, unused_predictions, unused_labels, **unused_params):
24 |     """Calculates the average loss of the examples in a mini-batch.
25 | 
26 |      Args:
27 |       unused_predictions: a 2-d tensor storing the prediction scores, in which
28 |         each row represents a sample in the mini-batch and each column
29 |         represents a class.
30 |       unused_labels: a 2-d tensor storing the labels, which has the same shape
31 |         as the unused_predictions. The labels must be in the range of 0 and 1.
32 |       unused_params: loss specific parameters.
33 | 
34 |     Returns:
35 |       A scalar loss tensor.
36 |     """
37 |     raise NotImplementedError()
38 | 
39 | 
40 | class CrossEntropyLoss(BaseLoss):
41 |   """Calculate the cross entropy loss between the predictions and labels.
42 |   """
43 | 
44 |   def calculate_loss(self, predictions, labels, **unused_params):
45 |     with tf.name_scope("loss_xent"):
46 |       epsilon = 10e-6
47 |       float_labels = tf.cast(labels, tf.float32)
48 |       cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + (
49 |           1 - float_labels) * tf.log(1 - predictions + epsilon)
50 |       cross_entropy_loss = tf.negative(cross_entropy_loss)
51 |       return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1))
52 | 
53 | 
54 | class HingeLoss(BaseLoss):
55 |   """Calculate the hinge loss between the predictions and labels.
56 | 
57 |   Note the subgradient is used in the backpropagation, and thus the optimization
58 |   may converge slower. The predictions trained by the hinge loss are between -1
59 |   and +1.
60 |   """
61 | 
62 |   def calculate_loss(self, predictions, labels, b=1.0, **unused_params):
63 |     with tf.name_scope("loss_hinge"):
64 |       float_labels = tf.cast(labels, tf.float32)
65 |       all_zeros = tf.zeros(tf.shape(float_labels), dtype=tf.float32)
66 |       all_ones = tf.ones(tf.shape(float_labels), dtype=tf.float32)
67 |       sign_labels = tf.subtract(tf.scalar_mul(2, float_labels), all_ones)
68 |       hinge_loss = tf.maximum(
69 |           all_zeros, tf.scalar_mul(b, all_ones) - sign_labels * predictions)
70 |       return tf.reduce_mean(tf.reduce_sum(hinge_loss, 1))
71 | 
72 | 
73 | class SoftmaxLoss(BaseLoss):
74 |   """Calculate the softmax loss between the predictions and labels.
75 | 
76 |   The function calculates the loss in the following way: first we feed the
77 |   predictions to the softmax activation function and then we calculate
78 |   the minus linear dot product between the logged softmax activations and the
79 |   normalized ground truth label.
80 | 
81 |   It is an extension to the one-hot label. It allows for more than one positive
82 |   labels for each sample.
83 |   """
84 | 
85 |   def calculate_loss(self, predictions, labels, **unused_params):
86 |     with tf.name_scope("loss_softmax"):
87 |       epsilon = 10e-8
88 |       float_labels = tf.cast(labels, tf.float32)
89 |       # l1 normalization (labels are no less than 0)
90 |       label_rowsum = tf.maximum(
91 |           tf.reduce_sum(float_labels, 1, keep_dims=True),
92 |           epsilon)
93 |       norm_float_labels = tf.div(float_labels, label_rowsum)
94 |       softmax_outputs = tf.nn.softmax(predictions)
95 |       softmax_loss = tf.negative(tf.reduce_sum(
96 |           tf.multiply(norm_float_labels, tf.log(softmax_outputs)), 1))
97 |     return tf.reduce_mean(softmax_loss)
98 | 


--------------------------------------------------------------------------------
/video_level_models.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Contains model definitions."""
 16 | import math
 17 | 
 18 | import models
 19 | import tensorflow as tf
 20 | import utils
 21 | 
 22 | from tensorflow import flags
 23 | import tensorflow.contrib.slim as slim
 24 | 
 25 | FLAGS = flags.FLAGS
 26 | flags.DEFINE_integer(
 27 |     "moe_num_mixtures", 2,
 28 |     "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
 29 | 
 30 | class LogisticModel(models.BaseModel):
 31 |   """Logistic model with L2 regularization."""
 32 | 
 33 |   def create_model(self, model_input, vocab_size, l2_penalty=1e-8, **unused_params):
 34 |     """Creates a logistic model.
 35 | 
 36 |     Args:
 37 |       model_input: 'batch' x 'num_features' matrix of input features.
 38 |       vocab_size: The number of classes in the dataset.
 39 | 
 40 |     Returns:
 41 |       A dictionary with a tensor containing the probability predictions of the
 42 |       model in the 'predictions' key. The dimensions of the tensor are
 43 |       batch_size x num_classes."""
 44 |     output = slim.fully_connected(
 45 |         model_input, vocab_size, activation_fn=tf.nn.sigmoid,
 46 |         weights_regularizer=slim.l2_regularizer(l2_penalty))
 47 |     return {"predictions": output}
 48 | 
 49 | class MoeModel(models.BaseModel):
 50 |   """A softmax over a mixture of logistic models (with L2 regularization)."""
 51 | 
 52 |   def create_model(self,
 53 |                    model_input,
 54 |                    vocab_size,
 55 |                    num_mixtures=None,
 56 |                    l2_penalty=1e-8,
 57 |                    **unused_params):
 58 |     """Creates a Mixture of (Logistic) Experts model.
 59 | 
 60 |      The model consists of a per-class softmax distribution over a
 61 |      configurable number of logistic classifiers. One of the classifiers in the
 62 |      mixture is not trained, and always predicts 0.
 63 | 
 64 |     Args:
 65 |       model_input: 'batch_size' x 'num_features' matrix of input features.
 66 |       vocab_size: The number of classes in the dataset.
 67 |       num_mixtures: The number of mixtures (excluding a dummy 'expert' that
 68 |         always predicts the non-existence of an entity).
 69 |       l2_penalty: How much to penalize the squared magnitudes of parameter
 70 |         values.
 71 |     Returns:
 72 |       A dictionary with a tensor containing the probability predictions of the
 73 |       model in the 'predictions' key. The dimensions of the tensor are
 74 |       batch_size x num_classes.
 75 |     """
 76 |     num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
 77 | 
 78 |     gate_activations = slim.fully_connected(
 79 |         model_input,
 80 |         vocab_size * (num_mixtures + 1),
 81 |         activation_fn=None,
 82 |         biases_initializer=None,
 83 |         weights_regularizer=slim.l2_regularizer(l2_penalty),
 84 |         scope="gates")
 85 |     expert_activations = slim.fully_connected(
 86 |         model_input,
 87 |         vocab_size * num_mixtures,
 88 |         activation_fn=None,
 89 |         weights_regularizer=slim.l2_regularizer(l2_penalty),
 90 |         scope="experts")
 91 | 
 92 |     gating_distribution = tf.nn.softmax(tf.reshape(
 93 |         gate_activations,
 94 |         [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
 95 |     expert_distribution = tf.nn.sigmoid(tf.reshape(
 96 |         expert_activations,
 97 |         [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures
 98 | 
 99 |     final_probabilities_by_class_and_batch = tf.reduce_sum(
100 |         gating_distribution[:, :num_mixtures] * expert_distribution, 1)
101 |     final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
102 |                                      [-1, vocab_size])
103 |     return {"predictions": final_probabilities}
104 | 


--------------------------------------------------------------------------------
/mean_average_precision_calculator.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Calculate the mean average precision.
 16 | 
 17 | It provides an interface for calculating mean average precision
 18 | for an entire list or the top-n ranked items.
 19 | 
 20 | Example usages:
 21 | We first call the function accumulate many times to process parts of the ranked
 22 | list. After processing all the parts, we call peek_map_at_n
 23 | to calculate the mean average precision.
 24 | 
 25 | ```
 26 | import random
 27 | 
 28 | p = np.array([[random.random() for _ in xrange(50)] for _ in xrange(1000)])
 29 | a = np.array([[random.choice([0, 1]) for _ in xrange(50)]
 30 |      for _ in xrange(1000)])
 31 | 
 32 | # mean average precision for 50 classes.
 33 | calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator(
 34 |             num_class=50)
 35 | calculator.accumulate(p, a)
 36 | aps = calculator.peek_map_at_n()
 37 | ```
 38 | """
 39 | 
 40 | import numpy
 41 | import average_precision_calculator
 42 | 
 43 | 
 44 | class MeanAveragePrecisionCalculator(object):
 45 |   """This class is to calculate mean average precision.
 46 |   """
 47 | 
 48 |   def __init__(self, num_class):
 49 |     """Construct a calculator to calculate the (macro) average precision.
 50 | 
 51 |     Args:
 52 |       num_class: A positive Integer specifying the number of classes.
 53 |       top_n_array: A list of positive integers specifying the top n for each
 54 |       class. The top n in each class will be used to calculate its average
 55 |       precision at n.
 56 |       The size of the array must be num_class.
 57 | 
 58 |     Raises:
 59 |       ValueError: An error occurred when num_class is not a positive integer;
 60 |       or the top_n_array is not a list of positive integers.
 61 |     """
 62 |     if not isinstance(num_class, int) or num_class <= 1:
 63 |       raise ValueError("num_class must be a positive integer.")
 64 | 
 65 |     self._ap_calculators = []  # member of AveragePrecisionCalculator
 66 |     self._num_class = num_class  # total number of classes
 67 |     for i in range(num_class):
 68 |       self._ap_calculators.append(
 69 |           average_precision_calculator.AveragePrecisionCalculator())
 70 | 
 71 |   def accumulate(self, predictions, actuals, num_positives=None):
 72 |     """Accumulate the predictions and their ground truth labels.
 73 | 
 74 |     Args:
 75 |       predictions: A list of lists storing the prediction scores. The outer
 76 |       dimension corresponds to classes.
 77 |       actuals: A list of lists storing the ground truth labels. The dimensions
 78 |       should correspond to the predictions input. Any value
 79 |       larger than 0 will be treated as positives, otherwise as negatives.
 80 |       num_positives: If provided, it is a list of numbers representing the
 81 |       number of true positives for each class. If not provided, the number of
 82 |       true positives will be inferred from the 'actuals' array.
 83 | 
 84 |     Raises:
 85 |       ValueError: An error occurred when the shape of predictions and actuals
 86 |       does not match.
 87 |     """
 88 |     if not num_positives:
 89 |       num_positives = [None for i in predictions.shape[1]]
 90 | 
 91 |     calculators = self._ap_calculators
 92 |     for i in range(len(predictions)):
 93 |       calculators[i].accumulate(predictions[i], actuals[i], num_positives[i])
 94 | 
 95 |   def clear(self):
 96 |     for calculator in self._ap_calculators:
 97 |       calculator.clear()
 98 | 
 99 |   def is_empty(self):
100 |     return ([calculator.heap_size for calculator in self._ap_calculators] ==
101 |             [0 for _ in range(self._num_class)])
102 | 
103 |   def peek_map_at_n(self):
104 |     """Peek the non-interpolated mean average precision at n.
105 | 
106 |     Returns:
107 |       An array of non-interpolated average precision at n (default 0) for each
108 |       class.
109 |     """
110 |     aps = [self._ap_calculators[i].peek_ap_at_n()
111 |            for i in range(self._num_class)]
112 |     return aps
113 | 


--------------------------------------------------------------------------------
/export_model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | """Utilities to export a model for batch prediction."""
 15 | 
 16 | import tensorflow as tf
 17 | import tensorflow.contrib.slim as slim
 18 | 
 19 | from tensorflow.python.saved_model import builder as saved_model_builder
 20 | from tensorflow.python.saved_model import signature_constants
 21 | from tensorflow.python.saved_model import signature_def_utils
 22 | from tensorflow.python.saved_model import tag_constants
 23 | from tensorflow.python.saved_model import utils as saved_model_utils
 24 | 
 25 | _TOP_PREDICTIONS_IN_OUTPUT = 20
 26 | 
 27 | class ModelExporter(object):
 28 | 
 29 |   def __init__(self, frame_features, model, reader):
 30 |     self.frame_features = frame_features
 31 |     self.model = model
 32 |     self.reader = reader
 33 | 
 34 |     with tf.Graph().as_default() as graph:
 35 |       self.inputs, self.outputs = self.build_inputs_and_outputs()
 36 |       self.graph = graph
 37 |       self.saver = tf.train.Saver(tf.trainable_variables(), sharded=True)
 38 | 
 39 |   def export_model(self, model_dir, global_step_val, last_checkpoint):
 40 |     """Exports the model so that it can used for batch predictions."""
 41 | 
 42 |     with self.graph.as_default():
 43 |       with tf.Session() as session:
 44 |         session.run(tf.global_variables_initializer())
 45 |         self.saver.restore(session, last_checkpoint)
 46 | 
 47 |         signature = signature_def_utils.build_signature_def(
 48 |             inputs=self.inputs,
 49 |             outputs=self.outputs,
 50 |             method_name=signature_constants.PREDICT_METHOD_NAME)
 51 | 
 52 |         signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
 53 |                          signature}
 54 | 
 55 |         model_builder = saved_model_builder.SavedModelBuilder(model_dir)
 56 |         model_builder.add_meta_graph_and_variables(session,
 57 |             tags=[tag_constants.SERVING],
 58 |             signature_def_map=signature_map,
 59 |             clear_devices=True)
 60 |         model_builder.save()
 61 | 
 62 |   def build_inputs_and_outputs(self):
 63 |     if self.frame_features:
 64 |       serialized_examples = tf.placeholder(tf.string, shape=(None,))
 65 | 
 66 |       fn = lambda x: self.build_prediction_graph(x)
 67 |       video_id_output, top_indices_output, top_predictions_output = (
 68 |           tf.map_fn(fn, serialized_examples,
 69 |                     dtype=(tf.string, tf.int32, tf.float32)))
 70 | 
 71 |     else:
 72 |       serialized_examples = tf.placeholder(tf.string, shape=(None,))
 73 | 
 74 |       video_id_output, top_indices_output, top_predictions_output = (
 75 |           self.build_prediction_graph(serialized_examples))
 76 | 
 77 |     inputs = {"example_bytes":
 78 |               saved_model_utils.build_tensor_info(serialized_examples)}
 79 | 
 80 |     outputs = {
 81 |         "video_id": saved_model_utils.build_tensor_info(video_id_output),
 82 |         "class_indexes": saved_model_utils.build_tensor_info(top_indices_output),
 83 |         "predictions": saved_model_utils.build_tensor_info(top_predictions_output)}
 84 | 
 85 |     return inputs, outputs
 86 | 
 87 |   def build_prediction_graph(self, serialized_examples):
 88 |     video_id, model_input_raw, labels_batch, num_frames = (
 89 |         self.reader.prepare_serialized_examples(serialized_examples))
 90 | 
 91 |     feature_dim = len(model_input_raw.get_shape()) - 1
 92 |     model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
 93 | 
 94 |     with tf.variable_scope("tower"):
 95 |       result = self.model.create_model(
 96 |           model_input,
 97 |           num_frames=num_frames,
 98 |           vocab_size=self.reader.num_classes,
 99 |           labels=labels_batch,
100 |           is_training=False)
101 | 
102 |       for variable in slim.get_model_variables():
103 |         tf.summary.histogram(variable.op.name, variable)
104 | 
105 |       predictions = result["predictions"]
106 | 
107 |       top_predictions, top_indices = tf.nn.top_k(predictions,
108 |           _TOP_PREDICTIONS_IN_OUTPUT)
109 |     return video_id, top_indices, top_predictions
110 | 


--------------------------------------------------------------------------------
/feature_extractor/feature_extractor.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | """Facilitates extracting YouTube8M features from RGB images."""
 15 | 
 16 | import os
 17 | import sys
 18 | import tarfile
 19 | import numpy
 20 | from six.moves import urllib
 21 | import tensorflow as tf
 22 | 
 23 | INCEPTION_TF_GRAPH = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
 24 | YT8M_PCA_MAT = 'http://data.yt8m.org/yt8m_pca.tgz'
 25 | MODEL_DIR = os.path.join(os.getenv('HOME'), 'yt8m')
 26 | 
 27 | 
 28 | class YouTube8MFeatureExtractor(object):
 29 |   """Extracts YouTube8M features for RGB frames.
 30 | 
 31 |   First time constructing this class will create directory `yt8m` inside your
 32 |   home directory, and will download inception model (85 MB) and YouTube8M PCA
 33 |   matrix (15 MB). If you want to use another directory, then pass it to argument
 34 |   `model_dir` of constructor.
 35 | 
 36 |   If the model_dir exist and contains the necessary files, then files will be
 37 |   re-used without download.
 38 | 
 39 |   Usage Example:
 40 | 
 41 |       from PIL import Image
 42 |       import numpy
 43 | 
 44 |       # Instantiate extractor. Slow if called first time on your machine, as it
 45 |       # needs to download 100 MB.
 46 |       extractor = YouTube8MFeatureExtractor()
 47 | 
 48 |       image_file = os.path.join(extractor._model_dir, 'cropped_panda.jpg')
 49 | 
 50 |       im = numpy.array(Image.open(image_file))
 51 |       features = extractor.extract_rgb_frame_features(im)
 52 | 
 53 |   ** Note: OpenCV reverses the order of channels (i.e. orders channels as BGR
 54 |   instead of RGB). If you are using OpenCV, then you must do:
 55 | 
 56 |       im = im[:, :, ::-1]  # Reverses order on last (i.e. channel) dimension.
 57 | 
 58 |   then call `extractor.extract_rgb_frame_features(im)`
 59 |   """
 60 | 
 61 |   def __init__(self, model_dir=MODEL_DIR):
 62 |     # Create MODEL_DIR if not created.
 63 |     self._model_dir = model_dir
 64 |     if not os.path.exists(model_dir):
 65 |       os.makedirs(model_dir)
 66 | 
 67 |     # Load Inception Network
 68 |     download_path = self._maybe_download(INCEPTION_TF_GRAPH)
 69 |     inception_proto_file = os.path.join(
 70 |         self._model_dir, 'classify_image_graph_def.pb')
 71 |     if not os.path.exists(inception_proto_file):
 72 |       tarfile.open(download_path, 'r:gz').extractall(model_dir)
 73 |     self._load_inception(inception_proto_file)
 74 | 
 75 |     # Load PCA Matrix.
 76 |     download_path = self._maybe_download(YT8M_PCA_MAT)
 77 |     pca_mean = os.path.join(self._model_dir, 'mean.npy')
 78 |     if not os.path.exists(pca_mean):
 79 |       tarfile.open(download_path, 'r:gz').extractall(model_dir)
 80 |     self._load_pca()
 81 | 
 82 |   def extract_rgb_frame_features(self, frame_rgb, apply_pca=True):
 83 |     """Applies the YouTube8M feature extraction over an RGB frame.
 84 | 
 85 |     This passes `frame_rgb` to inception3 model, extracting hidden layer
 86 |     activations and passing it to the YouTube8M PCA transformation.
 87 | 
 88 |     Args:
 89 |       frame_rgb: numpy array of uint8 with shape (height, width, channels) where
 90 |         channels must be 3 (RGB), and height and weight can be anything, as the
 91 |         inception model will resize.
 92 |       apply_pca: If not set, PCA transformation will be skipped.
 93 | 
 94 |     Returns:
 95 |       Output of inception from `frame_rgb` (2048-D) and optionally passed into
 96 |       YouTube8M PCA transformation (1024-D).
 97 |     """
 98 |     assert len(frame_rgb.shape) == 3
 99 |     assert frame_rgb.shape[2] == 3  # 3 channels (R, G, B)
100 |     with self._inception_graph.as_default():
101 |       frame_features = self.session.run('pool_3/_reshape:0',
102 |                                         feed_dict={'DecodeJpeg:0': frame_rgb})
103 |       frame_features = frame_features[0]  # Unbatch.
104 | 
105 |     if apply_pca:
106 |       frame_features = self.apply_pca(frame_features)
107 | 
108 |     return frame_features
109 | 
110 |   def apply_pca(self, frame_features):
111 |     """Applies the YouTube8M PCA Transformation over `frame_features`.
112 | 
113 |     Args:
114 |       frame_features: numpy array of floats, 2048 dimensional vector.
115 | 
116 |     Returns:
117 |       1024 dimensional vector as a numpy array.
118 |     """
119 |     # Subtract mean
120 |     feats = frame_features - self.pca_mean
121 | 
122 |     # Multiply by eigenvectors.
123 |     feats = feats.reshape((1, 2048)).dot(self.pca_eigenvecs).reshape((1024,))
124 | 
125 |     # Whiten
126 |     feats /= numpy.sqrt(self.pca_eigenvals + 1e-4)
127 |     return feats
128 | 
129 |   def _maybe_download(self, url):
130 |     """Downloads `url` if not in `_model_dir`."""
131 |     filename = os.path.basename(url)
132 |     download_path = os.path.join(self._model_dir, filename)
133 |     if os.path.exists(download_path):
134 |       return download_path
135 | 
136 |     def _progress(count, block_size, total_size):
137 |       sys.stdout.write('\r>> Downloading %s %.1f%%' % (
138 |           filename, float(count * block_size) / float(total_size) * 100.0))
139 |       sys.stdout.flush()
140 |     urllib.request.urlretrieve(url, download_path, _progress)
141 |     statinfo = os.stat(download_path)
142 |     print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
143 |     return download_path
144 | 
145 |   def _load_inception(self, proto_file):
146 |     graph_def = tf.GraphDef.FromString(open(proto_file, 'rb').read())
147 |     self._inception_graph = tf.Graph()
148 |     with self._inception_graph.as_default():
149 |       _ = tf.import_graph_def(graph_def, name='')
150 |       self.session = tf.Session()
151 | 
152 |   def _load_pca(self):
153 |     self.pca_mean = numpy.load(
154 |         os.path.join(self._model_dir, 'mean.npy'))[:, 0]
155 |     self.pca_eigenvals = numpy.load(
156 |         os.path.join(self._model_dir, 'eigenvals.npy'))[:1024, 0]
157 |     self.pca_eigenvecs = numpy.load(
158 |         os.path.join(self._model_dir, 'eigenvecs.npy')).T[:, :1024]
159 | 


--------------------------------------------------------------------------------
/feature_extractor/extract_tfrecords_main.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | """Produces tfrecord files similar to the YouTube-8M dataset.
 15 | 
 16 | It processes a CSV file containing lines like "<video_file>,<labels>", where
 17 | <video_file> must be a path of a video, and <labels> must be an integer list
 18 | joined with semi-colon ";". It processes all videos and outputs tfrecord file
 19 | onto --output_tfrecords_file.
 20 | 
 21 | It assumes that you have OpenCV installed and properly linked with ffmpeg (i.e.
 22 | function `cv2.VideoCapture().open('/path/to/some/video')` should return True).
 23 | 
 24 | The binary only processes the video stream (images) and not the audio stream.
 25 | """
 26 | 
 27 | import csv
 28 | import os
 29 | import sys
 30 | 
 31 | import cv2
 32 | import feature_extractor
 33 | import numpy
 34 | import tensorflow as tf
 35 | from tensorflow import app
 36 | from tensorflow import flags
 37 | 
 38 | FLAGS = flags.FLAGS
 39 | 
 40 | # In OpenCV3.X, this is available as cv2.CAP_PROP_POS_MSEC
 41 | # In OpenCV2.X, this is available as cv2.cv.CV_CAP_PROP_POS_MSEC
 42 | CAP_PROP_POS_MSEC = 0
 43 | 
 44 | if __name__ == '__main__':
 45 |   # Required flags for input and output.
 46 |   flags.DEFINE_string('output_tfrecords_file', None,
 47 |                       'File containing tfrecords will be written at this path.')
 48 |   flags.DEFINE_string('input_videos_csv', None,
 49 |                       'CSV file with lines "<video_file>,<labels>", where '
 50 |                       '<video_file> must be a path of a video and <labels> '
 51 |                       'must be an integer list joined with semi-colon ";"')
 52 |   # Optional flags.
 53 |   flags.DEFINE_string('model_dir', os.path.join(os.getenv('HOME'), 'yt8m'),
 54 |                       'Directory to store model files. It defaults to ~/yt8m')
 55 | 
 56 |   # The following flags are set to match the YouTube-8M dataset format.
 57 |   flags.DEFINE_integer('frames_per_second', 1,
 58 |                        'This many frames per second will be processed')
 59 |   flags.DEFINE_string('labels_feature_key', 'labels',
 60 |                       'Labels will be written to context feature with this '
 61 |                       'key, as int64 list feature.')
 62 |   flags.DEFINE_string('image_feature_key', 'rgb',
 63 |                       'Image features will be written to sequence feature with '
 64 |                       'this key, as bytes list feature, with only one entry, '
 65 |                       'containing quantized feature string.')
 66 |   flags.DEFINE_string('video_file_key_feature_key', 'video_id',
 67 |                       'Input <video_file> will be written to context feature '
 68 |                       'with this key, as bytes list feature, with only one '
 69 |                       'entry, containing the file path of the video. This '
 70 |                       'can be used for debugging but not for training or eval.')
 71 |   flags.DEFINE_boolean('insert_zero_audio_features', True,
 72 |                        'If set, inserts features with name "audio" to be 128-D '
 73 |                        'zero vectors. This allows you to use YouTube-8M '
 74 |                        'pre-trained model.')
 75 | 
 76 | 
 77 | def frame_iterator(filename, every_ms=1000, max_num_frames=300):
 78 |   """Uses OpenCV to iterate over all frames of filename at a given frequency.
 79 | 
 80 |   Args:
 81 |     filename: Path to video file (e.g. mp4)
 82 |     every_ms: The duration (in milliseconds) to skip between frames.
 83 |     max_num_frames: Maximum number of frames to process, taken from the
 84 |       beginning of the video.
 85 | 
 86 |   Yields:
 87 |     RGB frame with shape (image height, image width, channels)
 88 |   """
 89 |   video_capture = cv2.VideoCapture()
 90 |   if not video_capture.open(filename):
 91 |     print >> sys.stderr, 'Error: Cannot open video file ' + filename
 92 |     return
 93 |   last_ts = -99999  # The timestamp of last retrieved frame.
 94 |   num_retrieved = 0
 95 | 
 96 |   while num_retrieved < max_num_frames:
 97 |     # Skip frames
 98 |     while video_capture.get(CAP_PROP_POS_MSEC) < every_ms + last_ts:
 99 |       if not video_capture.read()[0]:
100 |         return
101 | 
102 |     last_ts = video_capture.get(CAP_PROP_POS_MSEC)
103 |     has_frames, frame = video_capture.read()
104 |     if not has_frames:
105 |       break
106 |     yield frame
107 |     num_retrieved += 1
108 | 
109 | 
110 | def _int64_list_feature(int64_list):
111 |   return tf.train.Feature(int64_list=tf.train.Int64List(value=int64_list))
112 | 
113 | 
114 | def _bytes_feature(value):
115 |   return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
116 | 
117 | 
118 | def _make_bytes(int_array):
119 |   if bytes == str:  # Python2
120 |     return ''.join(map(chr, int_array))
121 |   else:
122 |     return bytes(int_array)
123 | 
124 | 
125 | def quantize(features, min_quantized_value=-2.0, max_quantized_value=2.0):
126 |   """Quantizes float32 `features` into string."""
127 |   assert features.dtype == 'float32'
128 |   assert len(features.shape) == 1  # 1-D array
129 |   features = numpy.clip(features, min_quantized_value, max_quantized_value)
130 |   quantize_range = max_quantized_value - min_quantized_value
131 |   features = (features - min_quantized_value) * (255.0 / quantize_range)
132 |   features = [int(round(f)) for f in features]
133 | 
134 |   return _make_bytes(features)
135 | 
136 | 
137 | def main(unused_argv):
138 |   extractor = feature_extractor.YouTube8MFeatureExtractor(FLAGS.model_dir)
139 |   writer = tf.python_io.TFRecordWriter(FLAGS.output_tfrecords_file)
140 |   total_written = 0
141 |   total_error = 0
142 |   for video_file, labels in csv.reader(open(FLAGS.input_videos_csv)):
143 |     rgb_features = []
144 |     for rgb in frame_iterator(
145 |         video_file, every_ms=1000.0/FLAGS.frames_per_second):
146 |       features = extractor.extract_rgb_frame_features(rgb[:, :, ::-1])
147 |       rgb_features.append(_bytes_feature(quantize(features)))
148 | 
149 |     if not rgb_features:
150 |       print >> sys.stderr, 'Could not get features for ' + video_file
151 |       total_error += 1
152 |       continue
153 | 
154 |     # Create SequenceExample proto and write to output.
155 |     feature_list = {
156 |         FLAGS.image_feature_key: tf.train.FeatureList(feature=rgb_features),
157 |     }
158 |     if FLAGS.insert_zero_audio_features:
159 |       feature_list['audio'] = tf.train.FeatureList(
160 |           feature=[_bytes_feature(_make_bytes([0] * 128))] * len(rgb_features))
161 | 
162 |     example = tf.train.SequenceExample(
163 |         context=tf.train.Features(feature={
164 |             FLAGS.labels_feature_key:
165 |                 _int64_list_feature(sorted(map(int, labels.split(';')))),
166 |             FLAGS.video_file_key_feature_key:
167 |                 _bytes_feature(_make_bytes(map(ord, video_file))),
168 |         }),
169 |         feature_lists=tf.train.FeatureLists(feature_list=feature_list))
170 |     writer.write(example.SerializeToString())
171 |     total_written += 1
172 | 
173 |   writer.close()
174 |   print('Successfully encoded %i out of %i videos' % (
175 |       total_written, total_written + total_error))
176 | 
177 | 
178 | if __name__ == '__main__':
179 |   app.run(main)
180 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Contains a collection of util functions for training and evaluating.
 16 | """
 17 | 
 18 | import numpy
 19 | import tensorflow as tf
 20 | from tensorflow import logging
 21 | 
 22 | try:
 23 |   xrange          # Python 2
 24 | except NameError:
 25 |   xrange = range  # Python 3
 26 | 
 27 | 
 28 | def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
 29 |   """Dequantize the feature from the byte format to the float format.
 30 | 
 31 |   Args:
 32 |     feat_vector: the input 1-d vector.
 33 |     max_quantized_value: the maximum of the quantized value.
 34 |     min_quantized_value: the minimum of the quantized value.
 35 | 
 36 |   Returns:
 37 |     A float vector which has the same shape as feat_vector.
 38 |   """
 39 |   assert max_quantized_value > min_quantized_value
 40 |   quantized_range = max_quantized_value - min_quantized_value
 41 |   scalar = quantized_range / 255.0
 42 |   bias = (quantized_range / 512.0) + min_quantized_value
 43 |   return feat_vector * scalar + bias
 44 | 
 45 | 
 46 | def MakeSummary(name, value):
 47 |   """Creates a tf.Summary proto with the given name and value."""
 48 |   summary = tf.Summary()
 49 |   val = summary.value.add()
 50 |   val.tag = str(name)
 51 |   val.simple_value = float(value)
 52 |   return summary
 53 | 
 54 | 
 55 | def AddGlobalStepSummary(summary_writer,
 56 |                          global_step_val,
 57 |                          global_step_info_dict,
 58 |                          summary_scope="Eval"):
 59 |   """Add the global_step summary to the Tensorboard.
 60 | 
 61 |   Args:
 62 |     summary_writer: Tensorflow summary_writer.
 63 |     global_step_val: a int value of the global step.
 64 |     global_step_info_dict: a dictionary of the evaluation metrics calculated for
 65 |       a mini-batch.
 66 |     summary_scope: Train or Eval.
 67 | 
 68 |   Returns:
 69 |     A string of this global_step summary
 70 |   """
 71 |   this_hit_at_one = global_step_info_dict["hit_at_one"]
 72 |   this_perr = global_step_info_dict["perr"]
 73 |   this_loss = global_step_info_dict["loss"]
 74 |   examples_per_second = global_step_info_dict.get("examples_per_second", -1)
 75 | 
 76 |   summary_writer.add_summary(
 77 |       MakeSummary("GlobalStep/" + summary_scope + "_Hit@1", this_hit_at_one),
 78 |       global_step_val)
 79 |   summary_writer.add_summary(
 80 |       MakeSummary("GlobalStep/" + summary_scope + "_Perr", this_perr),
 81 |       global_step_val)
 82 |   summary_writer.add_summary(
 83 |       MakeSummary("GlobalStep/" + summary_scope + "_Loss", this_loss),
 84 |       global_step_val)
 85 | 
 86 |   if examples_per_second != -1:
 87 |     summary_writer.add_summary(
 88 |         MakeSummary("GlobalStep/" + summary_scope + "_Example_Second",
 89 |                     examples_per_second), global_step_val)
 90 | 
 91 |   summary_writer.flush()
 92 |   info = ("global_step {0} | Batch Hit@1: {1:.3f} | Batch PERR: {2:.3f} | Batch Loss: {3:.3f} "
 93 |           "| Examples_per_sec: {4:.3f}").format(
 94 |               global_step_val, this_hit_at_one, this_perr, this_loss,
 95 |               examples_per_second)
 96 |   return info
 97 | 
 98 | 
 99 | def AddEpochSummary(summary_writer,
100 |                     global_step_val,
101 |                     epoch_info_dict,
102 |                     summary_scope="Eval"):
103 |   """Add the epoch summary to the Tensorboard.
104 | 
105 |   Args:
106 |     summary_writer: Tensorflow summary_writer.
107 |     global_step_val: a int value of the global step.
108 |     epoch_info_dict: a dictionary of the evaluation metrics calculated for the
109 |       whole epoch.
110 |     summary_scope: Train or Eval.
111 | 
112 |   Returns:
113 |     A string of this global_step summary
114 |   """
115 |   epoch_id = epoch_info_dict["epoch_id"]
116 |   avg_hit_at_one = epoch_info_dict["avg_hit_at_one"]
117 |   avg_perr = epoch_info_dict["avg_perr"]
118 |   avg_loss = epoch_info_dict["avg_loss"]
119 |   aps = epoch_info_dict["aps"]
120 |   gap = epoch_info_dict["gap"]
121 |   mean_ap = numpy.mean(aps)
122 | 
123 |   summary_writer.add_summary(
124 |       MakeSummary("Epoch/" + summary_scope + "_Avg_Hit@1", avg_hit_at_one),
125 |       global_step_val)
126 |   summary_writer.add_summary(
127 |       MakeSummary("Epoch/" + summary_scope + "_Avg_Perr", avg_perr),
128 |       global_step_val)
129 |   summary_writer.add_summary(
130 |       MakeSummary("Epoch/" + summary_scope + "_Avg_Loss", avg_loss),
131 |       global_step_val)
132 |   summary_writer.add_summary(
133 |       MakeSummary("Epoch/" + summary_scope + "_MAP", mean_ap),
134 |           global_step_val)
135 |   summary_writer.add_summary(
136 |       MakeSummary("Epoch/" + summary_scope + "_GAP", gap),
137 |           global_step_val)
138 |   summary_writer.flush()
139 | 
140 |   info = ("epoch/eval number {0} | Avg_Hit@1: {1:.3f} | Avg_PERR: {2:.3f} "
141 |           "| MAP: {3:.3f} | GAP: {4:.3f} | Avg_Loss: {5:3f}").format(
142 |           epoch_id, avg_hit_at_one, avg_perr, mean_ap, gap, avg_loss)
143 |   return info
144 | 
145 | def GetListOfFeatureNamesAndSizes(feature_names, feature_sizes):
146 |   """Extract the list of feature names and the dimensionality of each feature
147 |      from string of comma separated values.
148 | 
149 |   Args:
150 |     feature_names: string containing comma separated list of feature names
151 |     feature_sizes: string containing comma separated list of feature sizes
152 | 
153 |   Returns:
154 |     List of the feature names and list of the dimensionality of each feature.
155 |     Elements in the first/second list are strings/integers.
156 |   """
157 |   list_of_feature_names = [
158 |       feature_names.strip() for feature_names in feature_names.split(',')]
159 |   list_of_feature_sizes = [
160 |       int(feature_sizes) for feature_sizes in feature_sizes.split(',')]
161 |   if len(list_of_feature_names) != len(list_of_feature_sizes):
162 |     logging.error("length of the feature names (=" +
163 |                   str(len(list_of_feature_names)) + ") != length of feature "
164 |                   "sizes (=" + str(len(list_of_feature_sizes)) + ")")
165 | 
166 |   return list_of_feature_names, list_of_feature_sizes
167 | 
168 | def clip_gradient_norms(gradients_to_variables, max_norm):
169 |   """Clips the gradients by the given value.
170 | 
171 |   Args:
172 |     gradients_to_variables: A list of gradient to variable pairs (tuples).
173 |     max_norm: the maximum norm value.
174 | 
175 |   Returns:
176 |     A list of clipped gradient to variable pairs.
177 |   """
178 |   clipped_grads_and_vars = []
179 |   for grad, var in gradients_to_variables:
180 |     if grad is not None:
181 |       if isinstance(grad, tf.IndexedSlices):
182 |         tmp = tf.clip_by_norm(grad.values, max_norm)
183 |         grad = tf.IndexedSlices(tmp, grad.indices, grad.dense_shape)
184 |       else:
185 |         grad = tf.clip_by_norm(grad, max_norm)
186 |     clipped_grads_and_vars.append((grad, var))
187 |   return clipped_grads_and_vars
188 | 
189 | def combine_gradients(tower_grads):
190 |   """Calculate the combined gradient for each shared variable across all towers.
191 | 
192 |   Note that this function provides a synchronization point across all towers.
193 | 
194 |   Args:
195 |     tower_grads: List of lists of (gradient, variable) tuples. The outer list
196 |       is over individual gradients. The inner list is over the gradient
197 |       calculation for each tower.
198 |   Returns:
199 |      List of pairs of (gradient, variable) where the gradient has been summed
200 |      across all towers.
201 |   """
202 |   filtered_grads = [[x for x in grad_list if x[0] is not None] for grad_list in tower_grads]
203 |   final_grads = []
204 |   for i in xrange(len(filtered_grads[0])):
205 |     grads = [filtered_grads[t][i] for t in xrange(len(filtered_grads))]
206 |     grad = tf.stack([x[0] for x in grads], 0)
207 |     grad = tf.reduce_sum(grad, 0)
208 |     final_grads.append((grad, filtered_grads[0][i][1],))
209 | 
210 |   return final_grads
211 | 


--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Binary for generating predictions over a set of videos."""
 16 | 
 17 | import os
 18 | import time
 19 | 
 20 | import numpy
 21 | import tensorflow as tf
 22 | 
 23 | from tensorflow import app
 24 | from tensorflow import flags
 25 | from tensorflow import gfile
 26 | from tensorflow import logging
 27 | 
 28 | import eval_util
 29 | import losses
 30 | import readers
 31 | import utils
 32 | 
 33 | FLAGS = flags.FLAGS
 34 | 
 35 | if __name__ == '__main__':
 36 |   flags.DEFINE_string("train_dir", "/tmp/yt8m_model/",
 37 |                       "The directory to load the model files from.")
 38 |   flags.DEFINE_string("checkpoint_file", "",
 39 |                       "If provided, this specific checkpoint file will be "
 40 |                       "used for inference. Otherwise, the latest checkpoint "
 41 |                       "from the train_dir' argument will be used instead.")
 42 |   flags.DEFINE_string("output_file", "",
 43 |                       "The file to save the predictions to.")
 44 |   flags.DEFINE_string(
 45 |       "input_data_pattern", "",
 46 |       "File glob defining the evaluation dataset in tensorflow.SequenceExample "
 47 |       "format. The SequenceExamples are expected to have an 'rgb' byte array "
 48 |       "sequence feature as well as a 'labels' int64 context feature.")
 49 | 
 50 |   # Model flags.
 51 |   flags.DEFINE_bool(
 52 |       "frame_features", False,
 53 |       "If set, then --input_data_pattern must be frame-level features. "
 54 |       "Otherwise, --input_data_pattern must be aggregated video-level "
 55 |       "features. The model must also be set appropriately (i.e. to read 3D "
 56 |       "batches VS 4D batches.")
 57 |   flags.DEFINE_integer(
 58 |       "batch_size", 8192,
 59 |       "How many examples to process per batch.")
 60 |   flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature "
 61 |                       "to use for training.")
 62 |   flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.")
 63 | 
 64 | 
 65 |   # Other flags.
 66 |   flags.DEFINE_integer("num_readers", 1,
 67 |                        "How many threads to use for reading input files.")
 68 |   flags.DEFINE_integer("top_k", 20,
 69 |                        "How many predictions to output per video.")
 70 | 
 71 | def format_lines(video_ids, predictions, top_k):
 72 |   batch_size = len(video_ids)
 73 |   for video_index in range(batch_size):
 74 |     top_indices = numpy.argpartition(predictions[video_index], -top_k)[-top_k:]
 75 |     line = [(class_index, predictions[video_index][class_index])
 76 |             for class_index in top_indices]
 77 |     line = sorted(line, key=lambda p: -p[1])
 78 |     yield video_ids[video_index].decode('utf-8') + "," + " ".join("%i %f" % pair
 79 |                                                   for pair in line) + "\n"
 80 | 
 81 | 
 82 | def get_input_data_tensors(reader, data_pattern, batch_size, num_readers=1):
 83 |   """Creates the section of the graph which reads the input data.
 84 | 
 85 |   Args:
 86 |     reader: A class which parses the input data.
 87 |     data_pattern: A 'glob' style path to the data files.
 88 |     batch_size: How many examples to process at a time.
 89 |     num_readers: How many I/O threads to use.
 90 | 
 91 |   Returns:
 92 |     A tuple containing the features tensor, labels tensor, and optionally a
 93 |     tensor containing the number of frames per video. The exact dimensions
 94 |     depend on the reader being used.
 95 | 
 96 |   Raises:
 97 |     IOError: If no files matching the given pattern were found.
 98 |   """
 99 |   with tf.name_scope("input"):
100 |     files = gfile.Glob(data_pattern)
101 |     if not files:
102 |       raise IOError("Unable to find input files. data_pattern='" +
103 |                     data_pattern + "'")
104 |     logging.info("number of input files: " + str(len(files)))
105 |     filename_queue = tf.train.string_input_producer(
106 |         files, num_epochs=1, shuffle=False)
107 |     examples_and_labels = [reader.prepare_reader(filename_queue)
108 |                            for _ in range(num_readers)]
109 | 
110 |     video_id_batch, video_batch, unused_labels, num_frames_batch = (
111 |         tf.train.batch_join(examples_and_labels,
112 |                             batch_size=batch_size,
113 |                             allow_smaller_final_batch=True,
114 |                             enqueue_many=True))
115 |     return video_id_batch, video_batch, num_frames_batch
116 | 
117 | def inference(reader, checkpoint_file, train_dir, data_pattern, out_file_location, batch_size, top_k):
118 |   with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess, gfile.Open(out_file_location, "w+") as out_file:
119 |     video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(reader, data_pattern, batch_size)
120 |     if checkpoint_file:
121 |       if not gfile.Exists(checkpoint_file + ".meta"):
122 |         logging.fatal("Unable to find checkpoint file at provided location '%s'" % checkpoint_file)
123 |       latest_checkpoint = checkpoint_file
124 |     else:
125 |       latest_checkpoint = tf.train.latest_checkpoint(train_dir)
126 |     if latest_checkpoint is None:
127 |       raise Exception("unable to find a checkpoint at location: %s" % train_dir)
128 |     else:
129 |       meta_graph_location = latest_checkpoint + ".meta"
130 |       logging.info("loading meta-graph: " + meta_graph_location)
131 |     saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True)
132 |     logging.info("restoring variables from " + latest_checkpoint)
133 |     saver.restore(sess, latest_checkpoint)
134 |     input_tensor = tf.get_collection("input_batch_raw")[0]
135 |     num_frames_tensor = tf.get_collection("num_frames")[0]
136 |     predictions_tensor = tf.get_collection("predictions")[0]
137 | 
138 |     # Workaround for num_epochs issue.
139 |     def set_up_init_ops(variables):
140 |       init_op_list = []
141 |       for variable in list(variables):
142 |         if "train_input" in variable.name:
143 |           init_op_list.append(tf.assign(variable, 1))
144 |           variables.remove(variable)
145 |       init_op_list.append(tf.variables_initializer(variables))
146 |       return init_op_list
147 | 
148 |     sess.run(set_up_init_ops(tf.get_collection_ref(
149 |         tf.GraphKeys.LOCAL_VARIABLES)))
150 | 
151 |     coord = tf.train.Coordinator()
152 |     threads = tf.train.start_queue_runners(sess=sess, coord=coord)
153 |     num_examples_processed = 0
154 |     start_time = time.time()
155 |     out_file.write("VideoId,LabelConfidencePairs\n")
156 | 
157 |     try:
158 |       while not coord.should_stop():
159 |           video_id_batch_val, video_batch_val,num_frames_batch_val = sess.run([video_id_batch, video_batch, num_frames_batch])
160 |           predictions_val, = sess.run([predictions_tensor], feed_dict={input_tensor: video_batch_val, num_frames_tensor: num_frames_batch_val})
161 |           now = time.time()
162 |           num_examples_processed += len(video_batch_val)
163 |           num_classes = predictions_val.shape[1]
164 |           logging.info("num examples processed: " + str(num_examples_processed) + " elapsed seconds: " + "{0:.2f}".format(now-start_time))
165 |           for line in format_lines(video_id_batch_val, predictions_val, top_k):
166 |             out_file.write(line)
167 |           out_file.flush()
168 | 
169 | 
170 |     except tf.errors.OutOfRangeError:
171 |         logging.info('Done with inference. The output file was written to ' + out_file_location)
172 |     finally:
173 |         coord.request_stop()
174 | 
175 |     coord.join(threads)
176 |     sess.close()
177 | 
178 | 
179 | def main(unused_argv):
180 |   logging.set_verbosity(tf.logging.INFO)
181 | 
182 |   # convert feature_names and feature_sizes to lists of values
183 |   feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
184 |       FLAGS.feature_names, FLAGS.feature_sizes)
185 | 
186 |   if FLAGS.frame_features:
187 |     reader = readers.YT8MFrameFeatureReader(feature_names=feature_names,
188 |                                             feature_sizes=feature_sizes)
189 |   else:
190 |     reader = readers.YT8MAggregatedFeatureReader(feature_names=feature_names,
191 |                                                  feature_sizes=feature_sizes)
192 | 
193 |   if FLAGS.output_file is "":
194 |     raise ValueError("'output_file' was not specified. "
195 |       "Unable to continue with inference.")
196 | 
197 |   if FLAGS.input_data_pattern is "":
198 |     raise ValueError("'input_data_pattern' was not specified. "
199 |       "Unable to continue with inference.")
200 | 
201 |   inference(reader, FLAGS.checkpoint_file, FLAGS.train_dir, FLAGS.input_data_pattern,
202 |     FLAGS.output_file, FLAGS.batch_size, FLAGS.top_k)
203 | 
204 | 
205 | if __name__ == "__main__":
206 |   app.run()
207 | 


--------------------------------------------------------------------------------
/eval_util.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Provides functions to help with evaluating models."""
 16 | import datetime
 17 | import numpy
 18 | 
 19 | from tensorflow.python.platform import gfile
 20 | 
 21 | import mean_average_precision_calculator as map_calculator
 22 | import average_precision_calculator as ap_calculator
 23 | 
 24 | def flatten(l):
 25 |   """ Merges a list of lists into a single list. """
 26 |   return [item for sublist in l for item in sublist]
 27 | 
 28 | def calculate_hit_at_one(predictions, actuals):
 29 |   """Performs a local (numpy) calculation of the hit at one.
 30 | 
 31 |   Args:
 32 |     predictions: Matrix containing the outputs of the model.
 33 |       Dimensions are 'batch' x 'num_classes'.
 34 |     actuals: Matrix containing the ground truth labels.
 35 |       Dimensions are 'batch' x 'num_classes'.
 36 | 
 37 |   Returns:
 38 |     float: The average hit at one across the entire batch.
 39 |   """
 40 |   top_prediction = numpy.argmax(predictions, 1)
 41 |   hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]
 42 |   return numpy.average(hits)
 43 | 
 44 | 
 45 | def calculate_precision_at_equal_recall_rate(predictions, actuals):
 46 |   """Performs a local (numpy) calculation of the PERR.
 47 | 
 48 |   Args:
 49 |     predictions: Matrix containing the outputs of the model.
 50 |       Dimensions are 'batch' x 'num_classes'.
 51 |     actuals: Matrix containing the ground truth labels.
 52 |       Dimensions are 'batch' x 'num_classes'.
 53 | 
 54 |   Returns:
 55 |     float: The average precision at equal recall rate across the entire batch.
 56 |   """
 57 |   aggregated_precision = 0.0
 58 |   num_videos = actuals.shape[0]
 59 |   for row in numpy.arange(num_videos):
 60 |     num_labels = int(numpy.sum(actuals[row]))
 61 |     top_indices = numpy.argpartition(predictions[row],
 62 |                                      -num_labels)[-num_labels:]
 63 |     item_precision = 0.0
 64 |     for label_index in top_indices:
 65 |       if predictions[row][label_index] > 0:
 66 |         item_precision += actuals[row][label_index]
 67 |     item_precision /= top_indices.size
 68 |     aggregated_precision += item_precision
 69 |   aggregated_precision /= num_videos
 70 |   return aggregated_precision
 71 | 
 72 | def calculate_gap(predictions, actuals, top_k=20):
 73 |   """Performs a local (numpy) calculation of the global average precision.
 74 | 
 75 |   Only the top_k predictions are taken for each of the videos.
 76 | 
 77 |   Args:
 78 |     predictions: Matrix containing the outputs of the model.
 79 |       Dimensions are 'batch' x 'num_classes'.
 80 |     actuals: Matrix containing the ground truth labels.
 81 |       Dimensions are 'batch' x 'num_classes'.
 82 |     top_k: How many predictions to use per video.
 83 | 
 84 |   Returns:
 85 |     float: The global average precision.
 86 |   """
 87 |   gap_calculator = ap_calculator.AveragePrecisionCalculator()
 88 |   sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, actuals, top_k)
 89 |   gap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives))
 90 |   return gap_calculator.peek_ap_at_n()
 91 | 
 92 | 
 93 | def top_k_by_class(predictions, labels, k=20):
 94 |   """Extracts the top k predictions for each video, sorted by class.
 95 | 
 96 |   Args:
 97 |     predictions: A numpy matrix containing the outputs of the model.
 98 |       Dimensions are 'batch' x 'num_classes'.
 99 |     k: the top k non-zero entries to preserve in each prediction.
100 | 
101 |   Returns:
102 |     A tuple (predictions,labels, true_positives). 'predictions' and 'labels'
103 |     are lists of lists of floats. 'true_positives' is a list of scalars. The
104 |     length of the lists are equal to the number of classes. The entries in the
105 |     predictions variable are probability predictions, and
106 |     the corresponding entries in the labels variable are the ground truth for
107 |     those predictions. The entries in 'true_positives' are the number of true
108 |     positives for each class in the ground truth.
109 | 
110 |   Raises:
111 |     ValueError: An error occurred when the k is not a positive integer.
112 |   """
113 |   if k <= 0:
114 |     raise ValueError("k must be a positive integer.")
115 |   k = min(k, predictions.shape[1])
116 |   num_classes = predictions.shape[1]
117 |   prediction_triplets= []
118 |   for video_index in range(predictions.shape[0]):
119 |     prediction_triplets.extend(top_k_triplets(predictions[video_index],labels[video_index], k))
120 |   out_predictions = [[] for v in range(num_classes)]
121 |   out_labels = [[] for v in range(num_classes)]
122 |   for triplet in prediction_triplets:
123 |     out_predictions[triplet[0]].append(triplet[1])
124 |     out_labels[triplet[0]].append(triplet[2])
125 |   out_true_positives = [numpy.sum(labels[:,i]) for i in range(num_classes)]
126 | 
127 |   return out_predictions, out_labels, out_true_positives
128 | 
129 | def top_k_triplets(predictions, labels, k=20):
130 |   """Get the top_k for a 1-d numpy array. Returns a sparse list of tuples in
131 |   (prediction, class) format"""
132 |   m = len(predictions)
133 |   k = min(k, m)
134 |   indices = numpy.argpartition(predictions, -k)[-k:]
135 |   return [(index, predictions[index], labels[index]) for index in indices]
136 | 
137 | class EvaluationMetrics(object):
138 |   """A class to store the evaluation metrics."""
139 | 
140 |   def __init__(self, num_class, top_k):
141 |     """Construct an EvaluationMetrics object to store the evaluation metrics.
142 | 
143 |     Args:
144 |       num_class: A positive integer specifying the number of classes.
145 |       top_k: A positive integer specifying how many predictions are considered per video.
146 | 
147 |     Raises:
148 |       ValueError: An error occurred when MeanAveragePrecisionCalculator cannot
149 |         not be constructed.
150 |     """
151 |     self.sum_hit_at_one = 0.0
152 |     self.sum_perr = 0.0
153 |     self.sum_loss = 0.0
154 |     self.map_calculator = map_calculator.MeanAveragePrecisionCalculator(num_class)
155 |     self.global_ap_calculator = ap_calculator.AveragePrecisionCalculator()
156 |     self.top_k = top_k
157 |     self.num_examples = 0
158 | 
159 |   def accumulate(self, predictions, labels, loss):
160 |     """Accumulate the metrics calculated locally for this mini-batch.
161 | 
162 |     Args:
163 |       predictions: A numpy matrix containing the outputs of the model.
164 |         Dimensions are 'batch' x 'num_classes'.
165 |       labels: A numpy matrix containing the ground truth labels.
166 |         Dimensions are 'batch' x 'num_classes'.
167 |       loss: A numpy array containing the loss for each sample.
168 | 
169 |     Returns:
170 |       dictionary: A dictionary storing the metrics for the mini-batch.
171 | 
172 |     Raises:
173 |       ValueError: An error occurred when the shape of predictions and actuals
174 |         does not match.
175 |     """
176 |     batch_size = labels.shape[0]
177 |     mean_hit_at_one = calculate_hit_at_one(predictions, labels)
178 |     mean_perr = calculate_precision_at_equal_recall_rate(predictions, labels)
179 |     mean_loss = numpy.mean(loss)
180 | 
181 |     # Take the top 20 predictions.
182 |     sparse_predictions, sparse_labels, num_positives = top_k_by_class(predictions, labels, self.top_k)
183 |     self.map_calculator.accumulate(sparse_predictions, sparse_labels, num_positives)
184 |     self.global_ap_calculator.accumulate(flatten(sparse_predictions), flatten(sparse_labels), sum(num_positives))
185 | 
186 |     self.num_examples += batch_size
187 |     self.sum_hit_at_one += mean_hit_at_one * batch_size
188 |     self.sum_perr += mean_perr * batch_size
189 |     self.sum_loss += mean_loss * batch_size
190 | 
191 |     return {"hit_at_one": mean_hit_at_one, "perr": mean_perr, "loss": mean_loss}
192 | 
193 |   def get(self):
194 |     """Calculate the evaluation metrics for the whole epoch.
195 | 
196 |     Raises:
197 |       ValueError: If no examples were accumulated.
198 | 
199 |     Returns:
200 |       dictionary: a dictionary storing the evaluation metrics for the epoch. The
201 |         dictionary has the fields: avg_hit_at_one, avg_perr, avg_loss, and
202 |         aps (default nan).
203 |     """
204 |     if self.num_examples <= 0:
205 |       raise ValueError("total_sample must be positive.")
206 |     avg_hit_at_one = self.sum_hit_at_one / self.num_examples
207 |     avg_perr = self.sum_perr / self.num_examples
208 |     avg_loss = self.sum_loss / self.num_examples
209 | 
210 |     aps = self.map_calculator.peek_map_at_n()
211 |     gap = self.global_ap_calculator.peek_ap_at_n()
212 | 
213 |     epoch_info_dict = {}
214 |     return {"avg_hit_at_one": avg_hit_at_one, "avg_perr": avg_perr,
215 |             "avg_loss": avg_loss, "aps": aps, "gap": gap}
216 | 
217 |   def clear(self):
218 |     """Clear the evaluation metrics and reset the EvaluationMetrics object."""
219 |     self.sum_hit_at_one = 0.0
220 |     self.sum_perr = 0.0
221 |     self.sum_loss = 0.0
222 |     self.map_calculator.clear()
223 |     self.global_ap_calculator.clear()
224 |     self.num_examples = 0
225 | 


--------------------------------------------------------------------------------
/frame_level_models.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Contains a collection of models which operate on variable-length sequences.
 16 | """
 17 | import math
 18 | 
 19 | import models
 20 | import video_level_models
 21 | import tensorflow as tf
 22 | import model_utils as utils
 23 | 
 24 | import tensorflow.contrib.slim as slim
 25 | from tensorflow import flags
 26 | 
 27 | FLAGS = flags.FLAGS
 28 | flags.DEFINE_integer("iterations", 30,
 29 |                      "Number of frames per batch for DBoF.")
 30 | flags.DEFINE_bool("dbof_add_batch_norm", True,
 31 |                   "Adds batch normalization to the DBoF model.")
 32 | flags.DEFINE_bool(
 33 |     "sample_random_frames", True,
 34 |     "If true samples random frames (for frame level models). If false, a random"
 35 |     "sequence of frames is sampled instead.")
 36 | flags.DEFINE_integer("dbof_cluster_size", 8192,
 37 |                      "Number of units in the DBoF cluster layer.")
 38 | flags.DEFINE_integer("dbof_hidden_size", 1024,
 39 |                      "Number of units in the DBoF hidden layer.")
 40 | flags.DEFINE_string("dbof_pooling_method", "max",
 41 |                     "The pooling method used in the DBoF cluster layer. "
 42 |                     "Choices are 'average' and 'max'.")
 43 | flags.DEFINE_string("video_level_classifier_model", "MoeModel",
 44 |                     "Some Frame-Level models can be decomposed into a "
 45 |                     "generalized pooling operation followed by a "
 46 |                     "classifier layer")
 47 | flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.")
 48 | flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.")
 49 | 
 50 | class FrameLevelLogisticModel(models.BaseModel):
 51 | 
 52 |   def create_model(self, model_input, vocab_size, num_frames, **unused_params):
 53 |     """Creates a model which uses a logistic classifier over the average of the
 54 |     frame-level features.
 55 | 
 56 |     This class is intended to be an example for implementors of frame level
 57 |     models. If you want to train a model over averaged features it is more
 58 |     efficient to average them beforehand rather than on the fly.
 59 | 
 60 |     Args:
 61 |       model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
 62 |                    input features.
 63 |       vocab_size: The number of classes in the dataset.
 64 |       num_frames: A vector of length 'batch' which indicates the number of
 65 |            frames for each video (before padding).
 66 | 
 67 |     Returns:
 68 |       A dictionary with a tensor containing the probability predictions of the
 69 |       model in the 'predictions' key. The dimensions of the tensor are
 70 |       'batch_size' x 'num_classes'.
 71 |     """
 72 |     num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
 73 |     feature_size = model_input.get_shape().as_list()[2]
 74 | 
 75 |     denominators = tf.reshape(
 76 |         tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
 77 |     avg_pooled = tf.reduce_sum(model_input,
 78 |                                axis=[1]) / denominators
 79 | 
 80 |     output = slim.fully_connected(
 81 |         avg_pooled, vocab_size, activation_fn=tf.nn.sigmoid,
 82 |         weights_regularizer=slim.l2_regularizer(1e-8))
 83 |     return {"predictions": output}
 84 | 
 85 | class DbofModel(models.BaseModel):
 86 |   """Creates a Deep Bag of Frames model.
 87 | 
 88 |   The model projects the features for each frame into a higher dimensional
 89 |   'clustering' space, pools across frames in that space, and then
 90 |   uses a configurable video-level model to classify the now aggregated features.
 91 | 
 92 |   The model will randomly sample either frames or sequences of frames during
 93 |   training to speed up convergence.
 94 | 
 95 |   Args:
 96 |     model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
 97 |                  input features.
 98 |     vocab_size: The number of classes in the dataset.
 99 |     num_frames: A vector of length 'batch' which indicates the number of
100 |          frames for each video (before padding).
101 | 
102 |   Returns:
103 |     A dictionary with a tensor containing the probability predictions of the
104 |     model in the 'predictions' key. The dimensions of the tensor are
105 |     'batch_size' x 'num_classes'.
106 |   """
107 | 
108 |   def create_model(self,
109 |                    model_input,
110 |                    vocab_size,
111 |                    num_frames,
112 |                    iterations=None,
113 |                    add_batch_norm=None,
114 |                    sample_random_frames=None,
115 |                    cluster_size=None,
116 |                    hidden_size=None,
117 |                    is_training=True,
118 |                    **unused_params):
119 |     iterations = iterations or FLAGS.iterations
120 |     add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
121 |     random_frames = sample_random_frames or FLAGS.sample_random_frames
122 |     cluster_size = cluster_size or FLAGS.dbof_cluster_size
123 |     hidden1_size = hidden_size or FLAGS.dbof_hidden_size
124 | 
125 |     num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
126 |     if random_frames:
127 |       model_input = utils.SampleRandomFrames(model_input, num_frames,
128 |                                              iterations)
129 |     else:
130 |       model_input = utils.SampleRandomSequence(model_input, num_frames,
131 |                                                iterations)
132 |     max_frames = model_input.get_shape().as_list()[1]
133 |     feature_size = model_input.get_shape().as_list()[2]
134 |     reshaped_input = tf.reshape(model_input, [-1, feature_size])
135 |     tf.summary.histogram("input_hist", reshaped_input)
136 | 
137 |     if add_batch_norm:
138 |       reshaped_input = slim.batch_norm(
139 |           reshaped_input,
140 |           center=True,
141 |           scale=True,
142 |           is_training=is_training,
143 |           scope="input_bn")
144 | 
145 |     cluster_weights = tf.get_variable("cluster_weights",
146 |       [feature_size, cluster_size],
147 |       initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
148 |     tf.summary.histogram("cluster_weights", cluster_weights)
149 |     activation = tf.matmul(reshaped_input, cluster_weights)
150 |     if add_batch_norm:
151 |       activation = slim.batch_norm(
152 |           activation,
153 |           center=True,
154 |           scale=True,
155 |           is_training=is_training,
156 |           scope="cluster_bn")
157 |     else:
158 |       cluster_biases = tf.get_variable("cluster_biases",
159 |         [cluster_size],
160 |         initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
161 |       tf.summary.histogram("cluster_biases", cluster_biases)
162 |       activation += cluster_biases
163 |     activation = tf.nn.relu6(activation)
164 |     tf.summary.histogram("cluster_output", activation)
165 | 
166 |     activation = tf.reshape(activation, [-1, max_frames, cluster_size])
167 |     activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
168 | 
169 |     hidden1_weights = tf.get_variable("hidden1_weights",
170 |       [cluster_size, hidden1_size],
171 |       initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
172 |     tf.summary.histogram("hidden1_weights", hidden1_weights)
173 |     activation = tf.matmul(activation, hidden1_weights)
174 |     if add_batch_norm:
175 |       activation = slim.batch_norm(
176 |           activation,
177 |           center=True,
178 |           scale=True,
179 |           is_training=is_training,
180 |           scope="hidden1_bn")
181 |     else:
182 |       hidden1_biases = tf.get_variable("hidden1_biases",
183 |         [hidden1_size],
184 |         initializer = tf.random_normal_initializer(stddev=0.01))
185 |       tf.summary.histogram("hidden1_biases", hidden1_biases)
186 |       activation += hidden1_biases
187 |     activation = tf.nn.relu6(activation)
188 |     tf.summary.histogram("hidden1_output", activation)
189 | 
190 |     aggregated_model = getattr(video_level_models,
191 |                                FLAGS.video_level_classifier_model)
192 |     return aggregated_model().create_model(
193 |         model_input=activation,
194 |         vocab_size=vocab_size,
195 |         **unused_params)
196 | 
197 | class LstmModel(models.BaseModel):
198 | 
199 |   def create_model(self, model_input, vocab_size, num_frames, **unused_params):
200 |     """Creates a model which uses a stack of LSTMs to represent the video.
201 | 
202 |     Args:
203 |       model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
204 |                    input features.
205 |       vocab_size: The number of classes in the dataset.
206 |       num_frames: A vector of length 'batch' which indicates the number of
207 |            frames for each video (before padding).
208 | 
209 |     Returns:
210 |       A dictionary with a tensor containing the probability predictions of the
211 |       model in the 'predictions' key. The dimensions of the tensor are
212 |       'batch_size' x 'num_classes'.
213 |     """
214 |     lstm_size = FLAGS.lstm_cells
215 |     number_of_layers = FLAGS.lstm_layers
216 | 
217 |     stacked_lstm = tf.contrib.rnn.MultiRNNCell(
218 |             [
219 |                 tf.contrib.rnn.BasicLSTMCell(
220 |                     lstm_size, forget_bias=1.0)
221 |                 for _ in range(number_of_layers)
222 |                 ])
223 | 
224 |     loss = 0.0
225 | 
226 |     outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
227 |                                        sequence_length=num_frames,
228 |                                        dtype=tf.float32)
229 | 
230 |     aggregated_model = getattr(video_level_models,
231 |                                FLAGS.video_level_classifier_model)
232 | 
233 |     return aggregated_model().create_model(
234 |         model_input=state[-1].h,
235 |         vocab_size=vocab_size,
236 |         **unused_params)
237 | 


--------------------------------------------------------------------------------
/average_precision_calculator.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Calculate or keep track of the interpolated average precision.
 16 | 
 17 | It provides an interface for calculating interpolated average precision for an
 18 | entire list or the top-n ranked items. For the definition of the
 19 | (non-)interpolated average precision:
 20 | http://trec.nist.gov/pubs/trec15/appendices/CE.MEASURES06.pdf
 21 | 
 22 | Example usages:
 23 | 1) Use it as a static function call to directly calculate average precision for
 24 | a short ranked list in the memory.
 25 | 
 26 | ```
 27 | import random
 28 | 
 29 | p = np.array([random.random() for _ in xrange(10)])
 30 | a = np.array([random.choice([0, 1]) for _ in xrange(10)])
 31 | 
 32 | ap = average_precision_calculator.AveragePrecisionCalculator.ap(p, a)
 33 | ```
 34 | 
 35 | 2) Use it as an object for long ranked list that cannot be stored in memory or
 36 | the case where partial predictions can be observed at a time (Tensorflow
 37 | predictions). In this case, we first call the function accumulate many times
 38 | to process parts of the ranked list. After processing all the parts, we call
 39 | peek_interpolated_ap_at_n.
 40 | ```
 41 | p1 = np.array([random.random() for _ in xrange(5)])
 42 | a1 = np.array([random.choice([0, 1]) for _ in xrange(5)])
 43 | p2 = np.array([random.random() for _ in xrange(5)])
 44 | a2 = np.array([random.choice([0, 1]) for _ in xrange(5)])
 45 | 
 46 | # interpolated average precision at 10 using 1000 break points
 47 | calculator = average_precision_calculator.AveragePrecisionCalculator(10)
 48 | calculator.accumulate(p1, a1)
 49 | calculator.accumulate(p2, a2)
 50 | ap3 = calculator.peek_ap_at_n()
 51 | ```
 52 | """
 53 | 
 54 | import heapq
 55 | import random
 56 | import numbers
 57 | 
 58 | import numpy
 59 | 
 60 | 
 61 | class AveragePrecisionCalculator(object):
 62 |   """Calculate the average precision and average precision at n."""
 63 | 
 64 |   def __init__(self, top_n=None):
 65 |     """Construct an AveragePrecisionCalculator to calculate average precision.
 66 | 
 67 |     This class is used to calculate the average precision for a single label.
 68 | 
 69 |     Args:
 70 |       top_n: A positive Integer specifying the average precision at n, or
 71 |         None to use all provided data points.
 72 | 
 73 |     Raises:
 74 |       ValueError: An error occurred when the top_n is not a positive integer.
 75 |     """
 76 |     if not ((isinstance(top_n, int) and top_n >= 0) or top_n is None):
 77 |       raise ValueError("top_n must be a positive integer or None.")
 78 | 
 79 |     self._top_n = top_n  # average precision at n
 80 |     self._total_positives = 0  # total number of positives have seen
 81 |     self._heap = []  # max heap of (prediction, actual)
 82 | 
 83 |   @property
 84 |   def heap_size(self):
 85 |     """Gets the heap size maintained in the class."""
 86 |     return len(self._heap)
 87 | 
 88 |   @property
 89 |   def num_accumulated_positives(self):
 90 |     """Gets the number of positive samples that have been accumulated."""
 91 |     return self._total_positives
 92 | 
 93 |   def accumulate(self, predictions, actuals, num_positives=None):
 94 |     """Accumulate the predictions and their ground truth labels.
 95 | 
 96 |     After the function call, we may call peek_ap_at_n to actually calculate
 97 |     the average precision.
 98 |     Note predictions and actuals must have the same shape.
 99 | 
100 |     Args:
101 |       predictions: a list storing the prediction scores.
102 |       actuals: a list storing the ground truth labels. Any value
103 |       larger than 0 will be treated as positives, otherwise as negatives.
104 |       num_positives = If the 'predictions' and 'actuals' inputs aren't complete,
105 |       then it's possible some true positives were missed in them. In that case,
106 |       you can provide 'num_positives' in order to accurately track recall.
107 | 
108 |     Raises:
109 |       ValueError: An error occurred when the format of the input is not the
110 |       numpy 1-D array or the shape of predictions and actuals does not match.
111 |     """
112 |     if len(predictions) != len(actuals):
113 |       raise ValueError("the shape of predictions and actuals does not match.")
114 | 
115 |     if not num_positives is None:
116 |       if not isinstance(num_positives, numbers.Number) or num_positives < 0:
117 |         raise ValueError("'num_positives' was provided but it wan't a nonzero number.")
118 | 
119 |     if not num_positives is None:
120 |       self._total_positives += num_positives
121 |     else:
122 |       self._total_positives += numpy.size(numpy.where(actuals > 0))
123 |     topk = self._top_n
124 |     heap = self._heap
125 | 
126 |     for i in range(numpy.size(predictions)):
127 |       if topk is None or len(heap) < topk:
128 |         heapq.heappush(heap, (predictions[i], actuals[i]))
129 |       else:
130 |         if predictions[i] > heap[0][0]:  # heap[0] is the smallest
131 |           heapq.heappop(heap)
132 |           heapq.heappush(heap, (predictions[i], actuals[i]))
133 | 
134 |   def clear(self):
135 |     """Clear the accumulated predictions."""
136 |     self._heap = []
137 |     self._total_positives = 0
138 | 
139 |   def peek_ap_at_n(self):
140 |     """Peek the non-interpolated average precision at n.
141 | 
142 |     Returns:
143 |       The non-interpolated average precision at n (default 0).
144 |       If n is larger than the length of the ranked list,
145 |       the average precision will be returned.
146 |     """
147 |     if self.heap_size <= 0:
148 |       return 0
149 |     predlists = numpy.array(list(zip(*self._heap)))
150 | 
151 |     ap = self.ap_at_n(predlists[0],
152 |                       predlists[1],
153 |                       n=self._top_n,
154 |                       total_num_positives=self._total_positives)
155 |     return ap
156 | 
157 |   @staticmethod
158 |   def ap(predictions, actuals):
159 |     """Calculate the non-interpolated average precision.
160 | 
161 |     Args:
162 |       predictions: a numpy 1-D array storing the sparse prediction scores.
163 |       actuals: a numpy 1-D array storing the ground truth labels. Any value
164 |       larger than 0 will be treated as positives, otherwise as negatives.
165 | 
166 |     Returns:
167 |       The non-interpolated average precision at n.
168 |       If n is larger than the length of the ranked list,
169 |       the average precision will be returned.
170 | 
171 |     Raises:
172 |       ValueError: An error occurred when the format of the input is not the
173 |       numpy 1-D array or the shape of predictions and actuals does not match.
174 |     """
175 |     return AveragePrecisionCalculator.ap_at_n(predictions,
176 |                                               actuals,
177 |                                               n=None)
178 | 
179 |   @staticmethod
180 |   def ap_at_n(predictions, actuals, n=20, total_num_positives=None):
181 |     """Calculate the non-interpolated average precision.
182 | 
183 |     Args:
184 |       predictions: a numpy 1-D array storing the sparse prediction scores.
185 |       actuals: a numpy 1-D array storing the ground truth labels. Any value
186 |       larger than 0 will be treated as positives, otherwise as negatives.
187 |       n: the top n items to be considered in ap@n.
188 |       total_num_positives : (optionally) you can specify the number of total
189 |         positive
190 |       in the list. If specified, it will be used in calculation.
191 | 
192 |     Returns:
193 |       The non-interpolated average precision at n.
194 |       If n is larger than the length of the ranked list,
195 |       the average precision will be returned.
196 | 
197 |     Raises:
198 |       ValueError: An error occurred when
199 |       1) the format of the input is not the numpy 1-D array;
200 |       2) the shape of predictions and actuals does not match;
201 |       3) the input n is not a positive integer.
202 |     """
203 |     if len(predictions) != len(actuals):
204 |       raise ValueError("the shape of predictions and actuals does not match.")
205 | 
206 |     if n is not None:
207 |       if not isinstance(n, int) or n <= 0:
208 |         raise ValueError("n must be 'None' or a positive integer."
209 |                          " It was '%s'." % n)
210 | 
211 |     ap = 0.0
212 | 
213 |     predictions = numpy.array(predictions)
214 |     actuals = numpy.array(actuals)
215 | 
216 |     # add a shuffler to avoid overestimating the ap
217 |     predictions, actuals = AveragePrecisionCalculator._shuffle(predictions,
218 |                                                                actuals)
219 |     sortidx = sorted(
220 |         range(len(predictions)),
221 |         key=lambda k: predictions[k],
222 |         reverse=True)
223 | 
224 |     if total_num_positives is None:
225 |       numpos = numpy.size(numpy.where(actuals > 0))
226 |     else:
227 |       numpos = total_num_positives
228 | 
229 |     if numpos == 0:
230 |       return 0
231 | 
232 |     if n is not None:
233 |       numpos = min(numpos, n)
234 |     delta_recall = 1.0 / numpos
235 |     poscount = 0.0
236 | 
237 |     # calculate the ap
238 |     r = len(sortidx)
239 |     if n is not None:
240 |       r = min(r, n)
241 |     for i in range(r):
242 |       if actuals[sortidx[i]] > 0:
243 |         poscount += 1
244 |         ap += poscount / (i + 1) * delta_recall
245 |     return ap
246 | 
247 |   @staticmethod
248 |   def _shuffle(predictions, actuals):
249 |     random.seed(0)
250 |     suffidx = random.sample(range(len(predictions)), len(predictions))
251 |     predictions = predictions[suffidx]
252 |     actuals = actuals[suffidx]
253 |     return predictions, actuals
254 | 
255 |   @staticmethod
256 |   def _zero_one_normalize(predictions, epsilon=1e-7):
257 |     """Normalize the predictions to the range between 0.0 and 1.0.
258 | 
259 |     For some predictions like SVM predictions, we need to normalize them before
260 |     calculate the interpolated average precision. The normalization will not
261 |     change the rank in the original list and thus won't change the average
262 |     precision.
263 | 
264 |     Args:
265 |       predictions: a numpy 1-D array storing the sparse prediction scores.
266 |       epsilon: a small constant to avoid denominator being zero.
267 | 
268 |     Returns:
269 |       The normalized prediction.
270 |     """
271 |     denominator = numpy.max(predictions) - numpy.min(predictions)
272 |     ret = (predictions - numpy.min(predictions)) / numpy.max(denominator,
273 |                                                              epsilon)
274 |     return ret
275 | 


--------------------------------------------------------------------------------
/readers.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Provides readers configured for different datasets."""
 16 | 
 17 | import tensorflow as tf
 18 | import utils
 19 | 
 20 | from tensorflow import logging
 21 | def resize_axis(tensor, axis, new_size, fill_value=0):
 22 |   """Truncates or pads a tensor to new_size on on a given axis.
 23 | 
 24 |   Truncate or extend tensor such that tensor.shape[axis] == new_size. If the
 25 |   size increases, the padding will be performed at the end, using fill_value.
 26 | 
 27 |   Args:
 28 |     tensor: The tensor to be resized.
 29 |     axis: An integer representing the dimension to be sliced.
 30 |     new_size: An integer or 0d tensor representing the new value for
 31 |       tensor.shape[axis].
 32 |     fill_value: Value to use to fill any new entries in the tensor. Will be
 33 |       cast to the type of tensor.
 34 | 
 35 |   Returns:
 36 |     The resized tensor.
 37 |   """
 38 |   tensor = tf.convert_to_tensor(tensor)
 39 |   shape = tf.unstack(tf.shape(tensor))
 40 | 
 41 |   pad_shape = shape[:]
 42 |   pad_shape[axis] = tf.maximum(0, new_size - shape[axis])
 43 | 
 44 |   shape[axis] = tf.minimum(shape[axis], new_size)
 45 |   shape = tf.stack(shape)
 46 | 
 47 |   resized = tf.concat([
 48 |       tf.slice(tensor, tf.zeros_like(shape), shape),
 49 |       tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
 50 |   ], axis)
 51 | 
 52 |   # Update shape.
 53 |   new_shape = tensor.get_shape().as_list()  # A copy is being made.
 54 |   new_shape[axis] = new_size
 55 |   resized.set_shape(new_shape)
 56 |   return resized
 57 | 
 58 | class BaseReader(object):
 59 |   """Inherit from this class when implementing new readers."""
 60 | 
 61 |   def prepare_reader(self, unused_filename_queue):
 62 |     """Create a thread for generating prediction and label tensors."""
 63 |     raise NotImplementedError()
 64 | 
 65 | 
 66 | class YT8MAggregatedFeatureReader(BaseReader):
 67 |   """Reads TFRecords of pre-aggregated Examples.
 68 | 
 69 |   The TFRecords must contain Examples with a sparse int64 'labels' feature and
 70 |   a fixed length float32 feature, obtained from the features in 'feature_name'.
 71 |   The float features are assumed to be an average of dequantized values.
 72 |   """
 73 | 
 74 |   def __init__(self,
 75 |                num_classes=4716,
 76 |                feature_sizes=[1024],
 77 |                feature_names=["mean_inc3"]):
 78 |     """Construct a YT8MAggregatedFeatureReader.
 79 | 
 80 |     Args:
 81 |       num_classes: a positive integer for the number of classes.
 82 |       feature_sizes: positive integer(s) for the feature dimensions as a list.
 83 |       feature_names: the feature name(s) in the tensorflow record as a list.
 84 |     """
 85 | 
 86 |     assert len(feature_names) == len(feature_sizes), \
 87 |     "length of feature_names (={}) != length of feature_sizes (={})".format( \
 88 |     len(feature_names), len(feature_sizes))
 89 | 
 90 |     self.num_classes = num_classes
 91 |     self.feature_sizes = feature_sizes
 92 |     self.feature_names = feature_names
 93 | 
 94 |   def prepare_reader(self, filename_queue, batch_size=1024):
 95 |     """Creates a single reader thread for pre-aggregated YouTube 8M Examples.
 96 | 
 97 |     Args:
 98 |       filename_queue: A tensorflow queue of filename locations.
 99 | 
100 |     Returns:
101 |       A tuple of video indexes, features, labels, and padding data.
102 |     """
103 |     reader = tf.TFRecordReader()
104 |     _, serialized_examples = reader.read_up_to(filename_queue, batch_size)
105 | 
106 |     tf.add_to_collection("serialized_examples", serialized_examples)
107 |     return self.prepare_serialized_examples(serialized_examples)
108 | 
109 |   def prepare_serialized_examples(self, serialized_examples):
110 |     # set the mapping from the fields to data types in the proto
111 |     num_features = len(self.feature_names)
112 |     assert num_features > 0, "self.feature_names is empty!"
113 |     assert len(self.feature_names) == len(self.feature_sizes), \
114 |     "length of feature_names (={}) != length of feature_sizes (={})".format( \
115 |     len(self.feature_names), len(self.feature_sizes))
116 | 
117 |     feature_map = {"video_id": tf.FixedLenFeature([], tf.string),
118 |                    "labels": tf.VarLenFeature(tf.int64)}
119 |     for feature_index in range(num_features):
120 |       feature_map[self.feature_names[feature_index]] = tf.FixedLenFeature(
121 |           [self.feature_sizes[feature_index]], tf.float32)
122 | 
123 |     features = tf.parse_example(serialized_examples, features=feature_map)
124 |     labels = tf.sparse_to_indicator(features["labels"], self.num_classes)
125 |     labels.set_shape([None, self.num_classes])
126 |     concatenated_features = tf.concat([
127 |         features[feature_name] for feature_name in self.feature_names], 1)
128 | 
129 |     return features["video_id"], concatenated_features, labels, tf.ones([tf.shape(serialized_examples)[0]])
130 | 
131 | class YT8MFrameFeatureReader(BaseReader):
132 |   """Reads TFRecords of SequenceExamples.
133 | 
134 |   The TFRecords must contain SequenceExamples with the sparse in64 'labels'
135 |   context feature and a fixed length byte-quantized feature vector, obtained
136 |   from the features in 'feature_names'. The quantized features will be mapped
137 |   back into a range between min_quantized_value and max_quantized_value.
138 |   """
139 | 
140 |   def __init__(self,
141 |                num_classes=4716,
142 |                feature_sizes=[1024],
143 |                feature_names=["inc3"],
144 |                max_frames=300):
145 |     """Construct a YT8MFrameFeatureReader.
146 | 
147 |     Args:
148 |       num_classes: a positive integer for the number of classes.
149 |       feature_sizes: positive integer(s) for the feature dimensions as a list.
150 |       feature_names: the feature name(s) in the tensorflow record as a list.
151 |       max_frames: the maximum number of frames to process.
152 |     """
153 | 
154 |     assert len(feature_names) == len(feature_sizes), \
155 |     "length of feature_names (={}) != length of feature_sizes (={})".format( \
156 |     len(feature_names), len(feature_sizes))
157 | 
158 |     self.num_classes = num_classes
159 |     self.feature_sizes = feature_sizes
160 |     self.feature_names = feature_names
161 |     self.max_frames = max_frames
162 | 
163 |   def get_video_matrix(self,
164 |                        features,
165 |                        feature_size,
166 |                        max_frames,
167 |                        max_quantized_value,
168 |                        min_quantized_value):
169 |     """Decodes features from an input string and quantizes it.
170 | 
171 |     Args:
172 |       features: raw feature values
173 |       feature_size: length of each frame feature vector
174 |       max_frames: number of frames (rows) in the output feature_matrix
175 |       max_quantized_value: the maximum of the quantized value.
176 |       min_quantized_value: the minimum of the quantized value.
177 | 
178 |     Returns:
179 |       feature_matrix: matrix of all frame-features
180 |       num_frames: number of frames in the sequence
181 |     """
182 |     decoded_features = tf.reshape(
183 |         tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
184 |         [-1, feature_size])
185 | 
186 |     num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
187 |     feature_matrix = utils.Dequantize(decoded_features,
188 |                                       max_quantized_value,
189 |                                       min_quantized_value)
190 |     feature_matrix = resize_axis(feature_matrix, 0, max_frames)
191 |     return feature_matrix, num_frames
192 | 
193 |   def prepare_reader(self,
194 |                      filename_queue,
195 |                      max_quantized_value=2,
196 |                      min_quantized_value=-2):
197 |     """Creates a single reader thread for YouTube8M SequenceExamples.
198 | 
199 |     Args:
200 |       filename_queue: A tensorflow queue of filename locations.
201 |       max_quantized_value: the maximum of the quantized value.
202 |       min_quantized_value: the minimum of the quantized value.
203 | 
204 |     Returns:
205 |       A tuple of video indexes, video features, labels, and padding data.
206 |     """
207 |     reader = tf.TFRecordReader()
208 |     _, serialized_example = reader.read(filename_queue)
209 | 
210 |     return self.prepare_serialized_examples(serialized_example,
211 |         max_quantized_value, min_quantized_value)
212 | 
213 |   def prepare_serialized_examples(self, serialized_example,
214 |       max_quantized_value=2, min_quantized_value=-2):
215 | 
216 |     contexts, features = tf.parse_single_sequence_example(
217 |         serialized_example,
218 |         context_features={"video_id": tf.FixedLenFeature(
219 |             [], tf.string),
220 |                           "labels": tf.VarLenFeature(tf.int64)},
221 |         sequence_features={
222 |             feature_name : tf.FixedLenSequenceFeature([], dtype=tf.string)
223 |             for feature_name in self.feature_names
224 |         })
225 | 
226 |     # read ground truth labels
227 |     labels = (tf.cast(
228 |         tf.sparse_to_dense(contexts["labels"].values, (self.num_classes,), 1,
229 |             validate_indices=False),
230 |         tf.bool))
231 | 
232 |     # loads (potentially) different types of features and concatenates them
233 |     num_features = len(self.feature_names)
234 |     assert num_features > 0, "No feature selected: feature_names is empty!"
235 | 
236 |     assert len(self.feature_names) == len(self.feature_sizes), \
237 |     "length of feature_names (={}) != length of feature_sizes (={})".format( \
238 |     len(self.feature_names), len(self.feature_sizes))
239 | 
240 |     num_frames = -1  # the number of frames in the video
241 |     feature_matrices = [None] * num_features  # an array of different features
242 |     for feature_index in range(num_features):
243 |       feature_matrix, num_frames_in_this_feature = self.get_video_matrix(
244 |           features[self.feature_names[feature_index]],
245 |           self.feature_sizes[feature_index],
246 |           self.max_frames,
247 |           max_quantized_value,
248 |           min_quantized_value)
249 |       if num_frames == -1:
250 |         num_frames = num_frames_in_this_feature
251 |       else:
252 |         tf.assert_equal(num_frames, num_frames_in_this_feature)
253 | 
254 |       feature_matrices[feature_index] = feature_matrix
255 | 
256 |     # cap the number of frames at self.max_frames
257 |     num_frames = tf.minimum(num_frames, self.max_frames)
258 | 
259 |     # concatenate different features
260 |     video_matrix = tf.concat(feature_matrices, 1)
261 | 
262 |     # convert to batch format.
263 |     # TODO: Do proper batch reads to remove the IO bottleneck.
264 |     batch_video_ids = tf.expand_dims(contexts["video_id"], 0)
265 |     batch_video_matrix = tf.expand_dims(video_matrix, 0)
266 |     batch_labels = tf.expand_dims(labels, 0)
267 |     batch_frames = tf.expand_dims(num_frames, 0)
268 | 
269 |     return batch_video_ids, batch_video_matrix, batch_labels, batch_frames
270 | 
271 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | """Binary for evaluating Tensorflow models on the YouTube-8M dataset."""
 15 | 
 16 | import time
 17 | 
 18 | import eval_util
 19 | import losses
 20 | import frame_level_models
 21 | import video_level_models
 22 | import readers
 23 | import tensorflow as tf
 24 | from tensorflow import app
 25 | from tensorflow import flags
 26 | from tensorflow import gfile
 27 | from tensorflow import logging
 28 | import utils
 29 | 
 30 | FLAGS = flags.FLAGS
 31 | 
 32 | if __name__ == "__main__":
 33 |   # Dataset flags.
 34 |   flags.DEFINE_string("train_dir", "/tmp/yt8m_model/",
 35 |                       "The directory to load the model files from. "
 36 |                       "The tensorboard metrics files are also saved to this "
 37 |                       "directory.")
 38 |   flags.DEFINE_string(
 39 |       "eval_data_pattern", "",
 40 |       "File glob defining the evaluation dataset in tensorflow.SequenceExample "
 41 |       "format. The SequenceExamples are expected to have an 'rgb' byte array "
 42 |       "sequence feature as well as a 'labels' int64 context feature.")
 43 |   flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature "
 44 |                       "to use for training.")
 45 |   flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.")
 46 |   flags.DEFINE_integer("num_classes", 4716, "Number of classes in dataset.")
 47 | 
 48 |   # Model flags.
 49 |   flags.DEFINE_bool(
 50 |       "frame_features", False,
 51 |       "If set, then --eval_data_pattern must be frame-level features. "
 52 |       "Otherwise, --eval_data_pattern must be aggregated video-level "
 53 |       "features. The model must also be set appropriately (i.e. to read 3D "
 54 |       "batches VS 4D batches.")
 55 |   flags.DEFINE_string(
 56 |       "model", "LogisticModel",
 57 |       "Which architecture to use for the model. Options include 'Logistic', "
 58 |       "'SingleMixtureMoe', and 'TwoLayerSigmoid'. See aggregated_models.py and "
 59 |       "frame_level_models.py for the model definitions.")
 60 |   flags.DEFINE_integer("batch_size", 1024,
 61 |                        "How many examples to process per batch.")
 62 |   flags.DEFINE_string("label_loss", "CrossEntropyLoss",
 63 |                       "Loss computed on validation data")
 64 | 
 65 |   # Other flags.
 66 |   flags.DEFINE_integer("num_readers", 8,
 67 |                        "How many threads to use for reading input files.")
 68 |   flags.DEFINE_boolean("run_once", False, "Whether to run eval only once.")
 69 |   flags.DEFINE_integer("top_k", 20, "How many predictions to output per video.")
 70 | 
 71 | 
 72 | def find_class_by_name(name, modules):
 73 |   """Searches the provided modules for the named class and returns it."""
 74 |   modules = [getattr(module, name, None) for module in modules]
 75 |   return next(a for a in modules if a)
 76 | 
 77 | 
 78 | def get_input_evaluation_tensors(reader,
 79 |                                  data_pattern,
 80 |                                  batch_size=1024,
 81 |                                  num_readers=1):
 82 |   """Creates the section of the graph which reads the evaluation data.
 83 | 
 84 |   Args:
 85 |     reader: A class which parses the training data.
 86 |     data_pattern: A 'glob' style path to the data files.
 87 |     batch_size: How many examples to process at a time.
 88 |     num_readers: How many I/O threads to use.
 89 | 
 90 |   Returns:
 91 |     A tuple containing the features tensor, labels tensor, and optionally a
 92 |     tensor containing the number of frames per video. The exact dimensions
 93 |     depend on the reader being used.
 94 | 
 95 |   Raises:
 96 |     IOError: If no files matching the given pattern were found.
 97 |   """
 98 |   logging.info("Using batch size of " + str(batch_size) + " for evaluation.")
 99 |   with tf.name_scope("eval_input"):
100 |     files = gfile.Glob(data_pattern)
101 |     if not files:
102 |       raise IOError("Unable to find the evaluation files.")
103 |     logging.info("number of evaluation files: " + str(len(files)))
104 |     filename_queue = tf.train.string_input_producer(
105 |         files, shuffle=False, num_epochs=1)
106 |     eval_data = [
107 |         reader.prepare_reader(filename_queue) for _ in range(num_readers)
108 |     ]
109 |     return tf.train.batch_join(
110 |         eval_data,
111 |         batch_size=batch_size,
112 |         capacity=3 * batch_size,
113 |         allow_smaller_final_batch=True,
114 |         enqueue_many=True)
115 | 
116 | 
117 | def build_graph(reader,
118 |                 model,
119 |                 eval_data_pattern,
120 |                 label_loss_fn,
121 |                 batch_size=1024,
122 |                 num_readers=1):
123 |   """Creates the Tensorflow graph for evaluation.
124 | 
125 |   Args:
126 |     reader: The data file reader. It should inherit from BaseReader.
127 |     model: The core model (e.g. logistic or neural net). It should inherit
128 |            from BaseModel.
129 |     eval_data_pattern: glob path to the evaluation data files.
130 |     label_loss_fn: What kind of loss to apply to the model. It should inherit
131 |                 from BaseLoss.
132 |     batch_size: How many examples to process at a time.
133 |     num_readers: How many threads to use for I/O operations.
134 |   """
135 | 
136 |   global_step = tf.Variable(0, trainable=False, name="global_step")
137 |   video_id_batch, model_input_raw, labels_batch, num_frames = get_input_evaluation_tensors(  # pylint: disable=g-line-too-long
138 |       reader,
139 |       eval_data_pattern,
140 |       batch_size=batch_size,
141 |       num_readers=num_readers)
142 |   tf.summary.histogram("model_input_raw", model_input_raw)
143 | 
144 |   feature_dim = len(model_input_raw.get_shape()) - 1
145 | 
146 |   # Normalize input features.
147 |   model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
148 | 
149 |   with tf.variable_scope("tower"):
150 |     result = model.create_model(model_input,
151 |                                 num_frames=num_frames,
152 |                                 vocab_size=reader.num_classes,
153 |                                 labels=labels_batch,
154 |                                 is_training=False)
155 |     predictions = result["predictions"]
156 |     tf.summary.histogram("model_activations", predictions)
157 |     if "loss" in result.keys():
158 |       label_loss = result["loss"]
159 |     else:
160 |       label_loss = label_loss_fn.calculate_loss(predictions, labels_batch)
161 | 
162 |   tf.add_to_collection("global_step", global_step)
163 |   tf.add_to_collection("loss", label_loss)
164 |   tf.add_to_collection("predictions", predictions)
165 |   tf.add_to_collection("input_batch", model_input)
166 |   tf.add_to_collection("video_id_batch", video_id_batch)
167 |   tf.add_to_collection("num_frames", num_frames)
168 |   tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
169 |   tf.add_to_collection("summary_op", tf.summary.merge_all())
170 | 
171 | 
172 | def evaluation_loop(video_id_batch, prediction_batch, label_batch, loss,
173 |                     summary_op, saver, summary_writer, evl_metrics,
174 |                     last_global_step_val):
175 |   """Run the evaluation loop once.
176 | 
177 |   Args:
178 |     video_id_batch: a tensor of video ids mini-batch.
179 |     prediction_batch: a tensor of predictions mini-batch.
180 |     label_batch: a tensor of label_batch mini-batch.
181 |     loss: a tensor of loss for the examples in the mini-batch.
182 |     summary_op: a tensor which runs the tensorboard summary operations.
183 |     saver: a tensorflow saver to restore the model.
184 |     summary_writer: a tensorflow summary_writer
185 |     evl_metrics: an EvaluationMetrics object.
186 |     last_global_step_val: the global step used in the previous evaluation.
187 | 
188 |   Returns:
189 |     The global_step used in the latest model.
190 |   """
191 | 
192 |   global_step_val = -1
193 |   with tf.Session() as sess:
194 |     latest_checkpoint = tf.train.latest_checkpoint(FLAGS.train_dir)
195 |     if latest_checkpoint:
196 |       logging.info("Loading checkpoint for eval: " + latest_checkpoint)
197 |       # Restores from checkpoint
198 |       saver.restore(sess, latest_checkpoint)
199 |       # Assuming model_checkpoint_path looks something like:
200 |       # /my-favorite-path/yt8m_train/model.ckpt-0, extract global_step from it.
201 |       global_step_val = latest_checkpoint.split("/")[-1].split("-")[-1]
202 |     else:
203 |       logging.info("No checkpoint file found.")
204 |       return global_step_val
205 | 
206 |     if global_step_val == last_global_step_val:
207 |       logging.info("skip this checkpoint global_step_val=%s "
208 |                    "(same as the previous one).", global_step_val)
209 |       return global_step_val
210 | 
211 |     sess.run([tf.local_variables_initializer()])
212 | 
213 |     # Start the queue runners.
214 |     fetches = [video_id_batch, prediction_batch, label_batch, loss, summary_op]
215 |     coord = tf.train.Coordinator()
216 |     try:
217 |       threads = []
218 |       for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
219 |         threads.extend(qr.create_threads(
220 |             sess, coord=coord, daemon=True,
221 |             start=True))
222 |       logging.info("enter eval_once loop global_step_val = %s. ",
223 |                    global_step_val)
224 | 
225 |       evl_metrics.clear()
226 | 
227 |       examples_processed = 0
228 |       while not coord.should_stop():
229 |         batch_start_time = time.time()
230 |         _, predictions_val, labels_val, loss_val, summary_val = sess.run(
231 |             fetches)
232 |         seconds_per_batch = time.time() - batch_start_time
233 |         example_per_second = labels_val.shape[0] / seconds_per_batch
234 |         examples_processed += labels_val.shape[0]
235 | 
236 |         iteration_info_dict = evl_metrics.accumulate(predictions_val,
237 |                                                      labels_val, loss_val)
238 |         iteration_info_dict["examples_per_second"] = example_per_second
239 | 
240 |         iterinfo = utils.AddGlobalStepSummary(
241 |             summary_writer,
242 |             global_step_val,
243 |             iteration_info_dict,
244 |             summary_scope="Eval")
245 |         logging.info("examples_processed: %d | %s", examples_processed,
246 |                      iterinfo)
247 | 
248 |     except tf.errors.OutOfRangeError as e:
249 |       logging.info(
250 |           "Done with batched inference. Now calculating global performance "
251 |           "metrics.")
252 |       # calculate the metrics for the entire epoch
253 |       epoch_info_dict = evl_metrics.get()
254 |       epoch_info_dict["epoch_id"] = global_step_val
255 | 
256 |       summary_writer.add_summary(summary_val, global_step_val)
257 |       epochinfo = utils.AddEpochSummary(
258 |           summary_writer,
259 |           global_step_val,
260 |           epoch_info_dict,
261 |           summary_scope="Eval")
262 |       logging.info(epochinfo)
263 |       evl_metrics.clear()
264 |     except Exception as e:  # pylint: disable=broad-except
265 |       logging.info("Unexpected exception: " + str(e))
266 |       coord.request_stop(e)
267 | 
268 |     coord.request_stop()
269 |     coord.join(threads, stop_grace_period_secs=10)
270 | 
271 |     return global_step_val
272 | 
273 | 
274 | def evaluate():
275 |   tf.set_random_seed(0)  # for reproducibility
276 |   with tf.Graph().as_default():
277 |     # convert feature_names and feature_sizes to lists of values
278 |     feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
279 |         FLAGS.feature_names, FLAGS.feature_sizes)
280 |     num_classes = FLAGS.num_classes
281 | 
282 |     if FLAGS.frame_features:
283 |       reader = readers.YT8MFrameFeatureReader(
284 |           num_classes=num_classes,
285 |           feature_names=feature_names,feature_sizes=feature_sizes)
286 |     else:
287 |       reader = readers.YT8MAggregatedFeatureReader(
288 |           num_classes=num_classes,
289 |           feature_names=feature_names, feature_sizes=feature_sizes)
290 | 
291 |     model = find_class_by_name(FLAGS.model,
292 |         [frame_level_models, video_level_models])()
293 |     label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
294 | 
295 |     if FLAGS.eval_data_pattern is "":
296 |       raise IOError("'eval_data_pattern' was not specified. " +
297 |                      "Nothing to evaluate.")
298 | 
299 |     build_graph(
300 |         reader=reader,
301 |         model=model,
302 |         eval_data_pattern=FLAGS.eval_data_pattern,
303 |         label_loss_fn=label_loss_fn,
304 |         num_readers=FLAGS.num_readers,
305 |         batch_size=FLAGS.batch_size)
306 |     logging.info("built evaluation graph")
307 |     video_id_batch = tf.get_collection("video_id_batch")[0]
308 |     prediction_batch = tf.get_collection("predictions")[0]
309 |     label_batch = tf.get_collection("labels")[0]
310 |     loss = tf.get_collection("loss")[0]
311 |     summary_op = tf.get_collection("summary_op")[0]
312 | 
313 |     saver = tf.train.Saver(tf.global_variables())
314 |     summary_writer = tf.summary.FileWriter(
315 |         FLAGS.train_dir, graph=tf.get_default_graph())
316 | 
317 |     evl_metrics = eval_util.EvaluationMetrics(reader.num_classes, FLAGS.top_k)
318 | 
319 |     last_global_step_val = -1
320 |     while True:
321 |       last_global_step_val = evaluation_loop(video_id_batch, prediction_batch,
322 |                                              label_batch, loss, summary_op,
323 |                                              saver, summary_writer, evl_metrics,
324 |                                              last_global_step_val)
325 |       if FLAGS.run_once:
326 |         break
327 | 
328 | 
329 | def main(unused_argv):
330 |   logging.set_verbosity(tf.logging.INFO)
331 |   print("tensorflow version: %s" % tf.__version__)
332 |   evaluate()
333 | 
334 | 
335 | if __name__ == "__main__":
336 |   app.run()
337 | 
338 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # YouTube-8M Tensorflow Starter Code
  2 | 
  3 | This repo contains starter code for training and evaluating machine learning
  4 | models over the [YouTube-8M](https://research.google.com/youtube8m/) dataset.
  5 | The code gives an end-to-end working example for reading the dataset, training a
  6 | TensorFlow model, and evaluating the performance of the model. Out of the box,
  7 | you can train several [model architectures](#overview-of-models) over either
  8 | frame-level or video-level features. The code can easily be extended to train
  9 | your own custom-defined models.
 10 | 
 11 | It is possible to train and evaluate on YouTube-8M in two ways: on Google Cloud
 12 | or on your own machine. This README provides instructions for both.
 13 | 
 14 | ## Table of Contents
 15 | * [Running on Google's Cloud Machine Learning Platform](#running-on-googles-cloud-machine-learning-platform)
 16 |    * [Requirements](#requirements)
 17 |    * [Testing Locally](#testing-locally)
 18 |    * [Training on the Cloud over Video-Level Features](#training-on-video-level-features)
 19 |    * [Evaluation and Inference](#evaluation-and-inference)
 20 |    * [Accessing Files on Google Cloud](#accessing-files-on-google-cloud)
 21 |    * [Using Frame-Level Features](#using-frame-level-features)
 22 |    * [Using Audio Features](#using-audio-features)
 23 |    * [Using Larger Machine Types](#using-larger-machine-types)
 24 | * [Running on Your Own Machine](#running-on-your-own-machine)
 25 |    * [Requirements](#requirements-1)
 26 |    * [Training on Video-Level Features](#training-on-video-level-features-1)
 27 |    * [Evaluation and Inference](#evaluation-and-inference-1)
 28 |    * [Using Frame-Level Features](#using-frame-level-features-1)
 29 |    * [Using Audio Features](#using-audio-features-1)
 30 |    * [Using GPUs](#using-gpus)
 31 |    * [Ground-Truth Label Files](#ground-truth-label-files)
 32 | * [Overview of Models](#overview-of-models)
 33 |    * [Video-Level Models](#video-level-models)
 34 |    * [Frame-Level Models](#frame-level-models)
 35 | * [Create Your Own Dataset Files](#create-your-own-dataset-files)
 36 | * [Overview of Files](#overview-of-files)
 37 |    * [Training](#training)
 38 |    * [Evaluation](#evaluation)
 39 |    * [Inference](#inference)
 40 |    * [Misc](#misc)
 41 | * [About This Project](#about-this-project)
 42 | 
 43 | ## Running on Google's Cloud Machine Learning Platform
 44 | 
 45 | ### Requirements
 46 | 
 47 | This option requires you to have an appropriately configured Google Cloud
 48 | Platform account. To create and configure your account, please make sure you
 49 | follow the instructions [here](https://cloud.google.com/ml/docs/how-tos/getting-set-up).
 50 | If you are participating in the Google Cloud & YouTube-8M Video Understanding
 51 | Challenge hosted on [kaggle](https://www.kaggle.com/c/youtube8m), see [these instructions](https://www.kaggle.com/c/youtube8m#getting-started-with-google-cloud) instead.
 52 | 
 53 | Please also verify that you have Python 2.7+ and Tensorflow 1.0.0 or higher
 54 | installed by running the following commands:
 55 | 
 56 | ```sh
 57 | python --version
 58 | python -c 'import tensorflow as tf; print(tf.__version__)'
 59 | ```
 60 | 
 61 | ### Testing Locally
 62 | All gcloud commands should be done from the directory *immediately above* the
 63 | source code. You should be able to see the source code directory if you
 64 | run 'ls'.
 65 | 
 66 | As you are developing your own models, you will want to test them
 67 | quickly to flush out simple problems without having to submit them to the cloud.
 68 | You can use the `gcloud beta ml local` set of commands for that.
 69 | 
 70 | Here is an example command line for video-level training:
 71 | 
 72 | ```sh
 73 | gcloud ml-engine local train \
 74 | --package-path=youtube-8m --module-name=youtube-8m.train -- \
 75 | --train_data_pattern='gs://youtube8m-ml/1/video_level/train/train*.tfrecord' \
 76 | --train_dir=/tmp/yt8m_train --model=LogisticModel --start_new_model
 77 | ```
 78 | 
 79 | You might want to download some training shards locally to speed things up and
 80 | allow you to work offline. The command below will copy 10 out of the 4096
 81 | training data files to the current directory.
 82 | 
 83 | ```sh
 84 | # Downloads 55MB of data.
 85 | gsutil cp gs://us.data.yt8m.org/1/video_level/train/traina[0-9].tfrecord .
 86 | ```
 87 | Once you download the files, you can point the job to them using the
 88 | 'train_data_pattern' argument (i.e. instead of pointing to the "gs://..."
 89 | files, you point to the local files).
 90 | 
 91 | Once your model is working locally, you can scale up on the Cloud
 92 | which is described below.
 93 | 
 94 | ### Training on the Cloud over Video-Level Features
 95 | 
 96 | The following commands will train a model on Google Cloud
 97 | over video-level features.
 98 | 
 99 | ```sh
100 | BUCKET_NAME=gs://${USER}_yt8m_train_bucket
101 | # (One Time) Create a storage bucket to store training logs and checkpoints.
102 | gsutil mb -l us-east1 $BUCKET_NAME
103 | # Submit the training job.
104 | JOB_NAME=yt8m_train_$(date +%Y%m%d_%H%M%S); gcloud --verbosity=debug ml-engine jobs \
105 | submit training $JOB_NAME \
106 | --package-path=youtube-8m --module-name=youtube-8m.train \
107 | --staging-bucket=$BUCKET_NAME --region=us-east1 \
108 | --config=youtube-8m/cloudml-gpu.yaml \
109 | -- --train_data_pattern='gs://youtube8m-ml-us-east1/1/video_level/train/train*.tfrecord' \
110 | --model=LogisticModel \
111 | --train_dir=$BUCKET_NAME/yt8m_train_video_level_logistic_model
112 | ```
113 | 
114 | In the 'gsutil' command above, the 'package-path' flag refers to the directory
115 | containing the 'train.py' script and more generally the python package which
116 | should be deployed to the cloud worker. The module-name refers to the specific
117 | python script which should be executed (in this case the train module).
118 | 
119 | It may take several minutes before the job starts running on Google Cloud.
120 | When it starts you will see outputs like the following:
121 | 
122 | ```
123 | training step 270| Hit@1: 0.68 PERR: 0.52 Loss: 638.453
124 | training step 271| Hit@1: 0.66 PERR: 0.49 Loss: 635.537
125 | training step 272| Hit@1: 0.70 PERR: 0.52 Loss: 637.564
126 | ```
127 | 
128 | At this point you can disconnect your console by pressing "ctrl-c". The
129 | model will continue to train indefinitely in the Cloud. Later, you can check
130 | on its progress or halt the job by visiting the
131 | [Google Cloud ML Jobs console](https://console.cloud.google.com/ml/jobs).
132 | 
133 | You can train many jobs at once and use tensorboard to compare their performance
134 | visually.
135 | 
136 | ```sh
137 | tensorboard --logdir=$BUCKET_NAME --port=8080
138 | ```
139 | 
140 | Once tensorboard is running, you can access it at the following url:
141 | [http://localhost:8080](http://localhost:8080).
142 | If you are using Google Cloud Shell, you can instead click the Web Preview button
143 | on the upper left corner of the Cloud Shell window and select "Preview on port 8080".
144 | This will bring up a new browser tab with the Tensorboard view.
145 | 
146 | ### Evaluation and Inference
147 | Here's how to evaluate a model on the validation dataset:
148 | 
149 | ```sh
150 | JOB_TO_EVAL=yt8m_train_video_level_logistic_model
151 | JOB_NAME=yt8m_eval_$(date +%Y%m%d_%H%M%S); gcloud --verbosity=debug ml-engine jobs \
152 | submit training $JOB_NAME \
153 | --package-path=youtube-8m --module-name=youtube-8m.eval \
154 | --staging-bucket=$BUCKET_NAME --region=us-east1 \
155 | --config=youtube-8m/cloudml-gpu.yaml \
156 | -- --eval_data_pattern='gs://youtube8m-ml-us-east1/1/video_level/validate/validate*.tfrecord' \
157 | --model=LogisticModel \
158 | --train_dir=$BUCKET_NAME/${JOB_TO_EVAL} --run_once=True
159 | ```
160 | 
161 | And here's how to perform inference with a model on the test set:
162 | 
163 | ```sh
164 | JOB_TO_EVAL=yt8m_train_video_level_logistic_model
165 | JOB_NAME=yt8m_inference_$(date +%Y%m%d_%H%M%S); gcloud --verbosity=debug ml-engine jobs \
166 | submit training $JOB_NAME \
167 | --package-path=youtube-8m --module-name=youtube-8m.inference \
168 | --staging-bucket=$BUCKET_NAME --region=us-east1 \
169 | --config=youtube-8m/cloudml-gpu.yaml \
170 | -- --input_data_pattern='gs://youtube8m-ml/1/video_level/test/test*.tfrecord' \
171 | --train_dir=$BUCKET_NAME/${JOB_TO_EVAL} \
172 | --output_file=$BUCKET_NAME/${JOB_TO_EVAL}/predictions.csv
173 | ```
174 | 
175 | Note the confusing use of 'training' in the above gcloud commands. Despite the
176 | name, the 'training' argument really just offers a cloud hosted
177 | python/tensorflow service. From the point of view of the Cloud Platform, there
178 | is no distinction between our training and inference jobs. The Cloud ML platform
179 | also offers specialized functionality for prediction with
180 | Tensorflow models, but discussing that is beyond the scope of this readme.
181 | 
182 | Once these job starts executing you will see outputs similar to the
183 | following for the evaluation code:
184 | 
185 | ```
186 | examples_processed: 1024 | global_step 447044 | Batch Hit@1: 0.782 | Batch PERR: 0.637 | Batch Loss: 7.821 | Examples_per_sec: 834.658
187 | ```
188 | 
189 | and the following for the inference code:
190 | 
191 | ```
192 | num examples processed: 8192 elapsed seconds: 14.85
193 | ```
194 | 
195 | ### Accessing Files on Google Cloud
196 | 
197 | You can browse the storage buckets you created on Google Cloud, for example, to
198 | access the trained models, prediction CSV files, etc. by visiting the
199 | [Google Cloud storage browser](https://console.cloud.google.com/storage/browser).
200 | 
201 | Alternatively, you can use the 'gsutil' command to download the files directly.
202 | For example, to download the output of the inference code from the previous
203 | section to your local machine, run:
204 | 
205 | 
206 | ```
207 | gsutil cp $BUCKET_NAME/${JOB_TO_EVAL}/predictions.csv .
208 | ```
209 | 
210 | ### Using Frame-Level Features
211 | 
212 | Append
213 | ```sh
214 | --frame_features=True --model=FrameLevelLogisticModel --feature_names="rgb" \
215 | --feature_sizes="1024" --batch_size=128 \
216 | --train_dir=$BUCKET_NAME/yt8m_train_frame_level_logistic_model
217 | ```
218 | 
219 | to the 'gcloud' commands given above, and change 'video_level' in paths to
220 | 'frame_level'. Here is a sample command to kick-off a frame-level job:
221 | 
222 | ```sh
223 | JOB_NAME=yt8m_train_$(date +%Y%m%d_%H%M%S); gcloud --verbosity=debug ml-engine jobs \
224 | submit training $JOB_NAME \
225 | --package-path=youtube-8m --module-name=youtube-8m.train \
226 | --staging-bucket=$BUCKET_NAME --region=us-east1 \
227 | --config=youtube-8m/cloudml-gpu.yaml \
228 | -- --train_data_pattern='gs://youtube8m-ml-us-east1/1/frame_level/train/train*.tfrecord' \
229 | --frame_features=True --model=FrameLevelLogisticModel --feature_names="rgb" \
230 | --feature_sizes="1024" --batch_size=128 \
231 | --train_dir=$BUCKET_NAME/yt8m_train_frame_level_logistic_model
232 | ```
233 | 
234 | The 'FrameLevelLogisticModel' is designed to provide equivalent results to a
235 | logistic model trained over the video-level features. Please look at the
236 | 'video_level_models.py' or 'frame_level_models.py' files to see how to implement
237 | your own models.
238 | 
239 | 
240 | ### Using Audio Features
241 | 
242 | The feature files (both Frame-Level and Video-Level) contain two sets of
243 | features: 1) visual and 2) audio. The code defaults to using the visual
244 | features only, but it is possible to use audio features instead of (or besides)
245 | visual features. To specify the (combination of) features to use you must set
246 | `--feature_names` and `--feature_sizes` flags. The visual and audio features are
247 | called 'rgb' and 'audio' and have 1024 and 128 dimensions, respectively.
248 | The two flags take a comma-separated list of values in string. For example, to
249 | use audio-visual Video-Level features the flags must be set as follows:
250 | 
251 | ```
252 | --feature_names="mean_rgb, mean_audio" --feature_sizes="1024, 128"
253 | ```
254 | 
255 | Similarly, to use audio-visual Frame-Level features use:
256 | 
257 | ```
258 | --feature_names="rgb, audio" --feature_sizes="1024, 128"
259 | ```
260 | 
261 | **NOTE:** Make sure the set of features and the order in which the appear in the
262 | lists provided to the two flags above match. Also, the order must match when
263 | running training, evaluation, or inference.
264 | 
265 | ### Using Larger Machine Types
266 | 
267 | Some complex frame-level models can take as long as a week to converge when
268 | using only one GPU. You can train these models more quickly by using more
269 | powerful machine types which have additional GPUs. To use a configuration with
270 | 4 GPUs, replace the argument to `--config` with `youtube-8m/cloudml-4gpu.yaml`.
271 | Be careful with this argument as it will also increase the rate you are charged
272 | by a factor of 4 as well.
273 | 
274 | ## Running on Your Own Machine
275 | 
276 | ### Requirements
277 | 
278 | The starter code requires Tensorflow. If you haven't installed it yet, follow
279 | the instructions on [tensorflow.org](https://www.tensorflow.org/install/).
280 | This code has been tested with Tensorflow 1.0.0. Going forward, we will continue
281 | to target the latest released version of Tensorflow.
282 | 
283 | Please verify that you have Python 2.7+ and Tensorflow 1.0.0 or higher
284 | installed by running the following commands:
285 | 
286 | ```sh
287 | python --version
288 | python -c 'import tensorflow as tf; print(tf.__version__)'
289 | ```
290 | 
291 | You can find complete instructions for downloading the dataset on the
292 | [YouTube-8M website](https://research.google.com/youtube8m/download.html).
293 | We recommend downloading the smaller video-level features dataset first when
294 | getting started. To do that, run:
295 | 
296 | ```
297 | mkdir -p features; cd features
298 | curl data.yt8m.org/download.py | partition=1/video_level/train mirror=us python
299 | ```
300 | 
301 | This will download the full set of video level features, which takes up 31GB
302 | of space.
303 | If you are located outside of North America, you should change the flag 'mirror'
304 | to 'eu' for Europe or 'asia' for Asia to speed up the transfer of the files.
305 | 
306 | Change 'train' to 'validate'/'test' and re-run the command to download the
307 | other splits of the dataset.
308 | 
309 | Change 'video_level' to 'frame_level' to download the frame-level features. The
310 | complete frame-level features take about 1.71TB of space. You can set the
311 | environment variable 'shard' to 'm,n' to download only m/n-th of the data. For
312 | example, to download 1/100-th of the frame-level features from the training set,
313 | run:
314 | 
315 | ```
316 | curl data.yt8m.org/download.py | shard=1,100 partition=1/frame_level/train mirror=us python
317 | ```
318 | 
319 | ### Training on Video-Level Features
320 | 
321 | To start training a logistic model on the video-level features, run
322 | 
323 | ```sh
324 | MODEL_DIR=/tmp/yt8m
325 | python train.py --train_data_pattern='/path/to/features/train*.tfrecord' --model=LogisticModel --train_dir=$MODEL_DIR/video_level_logistic_model
326 | ```
327 | 
328 | Since the dataset is sharded into 4096 individual files, we use a wildcard (\*)
329 | to represent all of those files.
330 | 
331 | By default, the training code will frequently write _checkpoint_ files (i.e.
332 | values of all trainable parameters, at the current training iteration). These
333 | will be written to the `--train_dir`. If you re-use a `--train_dir`, the trainer
334 | will first restore the latest checkpoint written in that directory. This only
335 | works if the architecture of the checkpoint matches the graph created by the
336 | training code. If you are in active development/debugging phase, consider
337 | adding `--start_new_model` flag to your run configuration.
338 | 
339 | ### Evaluation and Inference
340 | 
341 | To evaluate the model, run
342 | 
343 | ```sh
344 | python eval.py --eval_data_pattern='/path/to/features/validate*.tfrecord' --model=LogisticModel --train_dir=$MODEL_DIR/video_level_logistic_model --run_once=True
345 | ```
346 | 
347 | As the model is training or evaluating, you can view the results on tensorboard
348 | by running
349 | 
350 | ```sh
351 | tensorboard --logdir=$MODEL_DIR
352 | ```
353 | 
354 | and navigating to http://localhost:6006 in your web browser.
355 | 
356 | When you are happy with your model, you can generate a csv file of predictions
357 | from it by running
358 | 
359 | ```sh
360 | python inference.py --output_file=$MODEL_DIR/video_level_logistic_model/predictions.csv --input_data_pattern='/path/to/features/test*.tfrecord' --train_dir=$MODEL_DIR/video_level_logistic_model
361 | ```
362 | 
363 | This will output the top 20 predicted labels from the model for every example
364 | to 'predictions.csv'.
365 | 
366 | ### Using Frame-Level Features
367 | 
368 | Follow the same instructions as above, appending
369 | `--frame_features=True --model=FrameLevelLogisticModel --feature_names="rgb"
370 | --feature_sizes="1024" --train_dir=$MODEL_DIR/frame_level_logistic_model`
371 | for the 'train.py', 'eval.py', and 'inference.py' scripts.
372 | 
373 | The 'FrameLevelLogisticModel' is designed to provide equivalent results to a
374 | logistic model trained over the video-level features. Please look at the
375 | 'models.py' file to see how to implement your own models.
376 | 
377 | ### Using Audio Features
378 | 
379 | See [Using Audio Features](#using-audio-features) section above.
380 | 
381 | ### Using GPUs
382 | 
383 | If your Tensorflow installation has GPU support, this code will make use of all
384 | of your compatible GPUs. You can verify your installation by running
385 | 
386 | ```
387 | python -c 'import tensorflow as tf; tf.Session()'
388 | ```
389 | 
390 | This will print out something like the following for each of your compatible
391 | GPUs.
392 | 
393 | ```
394 | I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties:
395 | name: Tesla M40
396 | major: 5 minor: 2 memoryClockRate (GHz) 1.112
397 | pciBusID 0000:04:00.0
398 | Total memory: 11.25GiB
399 | Free memory: 11.09GiB
400 | ...
401 | ```
402 | 
403 | If at least one GPU was found, the forward and backward passes will be computed
404 | with the GPUs, whereas the CPU will be used primarily for the input and output
405 | pipelines. If you have multiple GPUs, each of them will be given a full batch
406 | of examples, and the resulting gradients will be summed together before being
407 | applied. This will increase your effective batch size. For example, if you set
408 | `batch_size=128` and you have 4 GPUs, this will result in 512 examples being
409 | evaluated every training step.
410 | 
411 | ### Ground-Truth Label Files
412 | 
413 | We also provide CSV files containing the ground-truth label information of the
414 | 'train' and 'validation' partitions of the dataset. These files can be
415 | downloaded using 'gsutil' command:
416 | 
417 | ```
418 | gsutil cp gs://us.data.yt8m.org/1/ground_truth_labels/train_labels.csv /destination/folder/
419 | gsutil cp gs://us.data.yt8m.org/1/ground_truth_labels/validate_labels.csv /destination/folder/
420 | ```
421 | 
422 | or directly using the following links:
423 | 
424 | *   [http://us.data.yt8m.org/1/ground_truth_labels/train_labels.csv](http://us.data.yt8m.org/1/ground_truth_labels/train_labels.csv)
425 | *   [http://us.data.yt8m.org/1/ground_truth_labels/validate_labels.csv](http://us.data.yt8m.org/1/ground_truth_labels/validate_labels.csv)
426 | 
427 | Each line in the files starts with the video id and is followed by the list of
428 | ground-truth labels corresponding to that video. For example, for a video with
429 | id 'VIDEO_ID' and two labels 'LABEL1' and 'LABEL2' we store the following line:
430 | 
431 | ```
432 | VIDEO_ID,LABEL1 LABEL2
433 | ```
434 | 
435 | ## Overview of Models
436 | 
437 | This sample code contains implementations of the models given in the
438 | [YouTube-8M technical report](https://arxiv.org/abs/1609.08675).
439 | 
440 | ### Video-Level Models
441 | *   `LogisticModel`: Linear projection of the output features into the label
442 |                      space, followed by a sigmoid function to convert logit
443 |                      values to probabilities.
444 | *   `MoeModel`: A per-class softmax distribution over a configurable number of
445 |                 logistic classifiers. One of the classifiers in the mixture
446 |                 is not trained, and always predicts 0.
447 | 
448 | ### Frame-Level Models
449 | * `LstmModel`: Processes the features for each frame using a multi-layered
450 |                LSTM neural net. The final internal state of the LSTM
451 |                is input to a video-level model for classification. Note that
452 |                you will need to change the learning rate to 0.001 when using
453 |                this model.
454 | * `DbofModel`: Projects the features for each frame into a higher dimensional
455 |                'clustering' space, pools across frames in that space, and then
456 |                uses a video-level model to classify the now aggregated features.
457 | * `FrameLevelLogisticModel`: Equivalent to 'LogisticModel', but performs
458 |                              average-pooling on the fly over frame-level
459 |                              features rather than using pre-aggregated features.
460 | 
461 | ## Create Your Own Dataset Files
462 | You can create your dataset files from your own videos. Our
463 | [feature extractor](./feature_extractor) code creates `tfrecord`
464 | files, identical to our dataset files. You can use our starter code to train on
465 | the `tfrecord` files output by the feature extractor. In addition, you can
466 | fine-tune your YouTube-8M models on your new dataset. 
467 | 
468 | ## Overview of Files
469 | 
470 | ### Training
471 | *   `train.py`: The primary script for training models.
472 | *   `losses.py`: Contains definitions for loss functions.
473 | *   `models.py`: Contains the base class for defining a model.
474 | *   `video_level_models.py`: Contains definitions for models that take
475 |                              aggregated features as input.
476 | *   `frame_level_models.py`: Contains definitions for models that take frame-
477 |                              level features as input.
478 | *   `model_util.py`: Contains functions that are of general utility for
479 |                      implementing models.
480 | *   `export_model.py`: Provides a class to export a model during training
481 |                        for later use in batch prediction.
482 | *   `readers.py`: Contains definitions for the Video dataset and Frame
483 |                   dataset readers.
484 | 
485 | ### Evaluation
486 | *   `eval.py`: The primary script for evaluating models.
487 | *   `eval_util.py`: Provides a class that calculates all evaluation metrics.
488 | *   `average_precision_calculator.py`: Functions for calculating
489 |                                        average precision.
490 | *   `mean_average_precision_calculator.py`: Functions for calculating mean
491 |                                             average precision.
492 | 
493 | ### Inference
494 | *   `inference.py`: Generates an output file containing predictions of
495 |                     the model over a set of videos.
496 | 
497 | ### Misc
498 | *   `README.md`: This documentation.
499 | *   `utils.py`: Common functions.
500 | *   `convert_prediction_from_json_to_csv.py`: Converts the JSON output of
501 |         batch prediction into a CSV file for submission.
502 | 
503 | ## About This Project
504 | This project is meant help people quickly get started working with the
505 | [YouTube-8M](https://research.google.com/youtube8m/) dataset.
506 | This is not an official Google product.
507 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | """Binary for training Tensorflow models on the YouTube-8M dataset."""
 15 | 
 16 | import json
 17 | import os
 18 | import time
 19 | 
 20 | import eval_util
 21 | import export_model
 22 | import losses
 23 | import frame_level_models
 24 | import video_level_models
 25 | import readers
 26 | import tensorflow as tf
 27 | import tensorflow.contrib.slim as slim
 28 | from tensorflow import app
 29 | from tensorflow import flags
 30 | from tensorflow import gfile
 31 | from tensorflow import logging
 32 | from tensorflow.python.client import device_lib
 33 | import utils
 34 | 
 35 | FLAGS = flags.FLAGS
 36 | 
 37 | if __name__ == "__main__":
 38 |   # Dataset flags.
 39 |   flags.DEFINE_string("train_dir", "/tmp/yt8m_model/",
 40 |                       "The directory to save the model files in.")
 41 |   flags.DEFINE_string(
 42 |       "train_data_pattern", "",
 43 |       "File glob for the training dataset. If the files refer to Frame Level "
 44 |       "features (i.e. tensorflow.SequenceExample), then set --reader_type "
 45 |       "format. The (Sequence)Examples are expected to have 'rgb' byte array "
 46 |       "sequence feature as well as a 'labels' int64 context feature.")
 47 |   flags.DEFINE_string("feature_names", "mean_rgb", "Name of the feature "
 48 |                       "to use for training.")
 49 |   flags.DEFINE_string("feature_sizes", "1024", "Length of the feature vectors.")
 50 |   flags.DEFINE_integer("num_classes", 4716, "Number of classes in dataset.")
 51 | 
 52 |   # Model flags.
 53 |   flags.DEFINE_bool(
 54 |       "frame_features", False,
 55 |       "If set, then --train_data_pattern must be frame-level features. "
 56 |       "Otherwise, --train_data_pattern must be aggregated video-level "
 57 |       "features. The model must also be set appropriately (i.e. to read 3D "
 58 |       "batches VS 4D batches.")
 59 |   flags.DEFINE_string(
 60 |       "model", "LogisticModel",
 61 |       "Which architecture to use for the model. Models are defined "
 62 |       "in models.py.")
 63 |   flags.DEFINE_bool(
 64 |       "start_new_model", False,
 65 |       "If set, this will not resume from a checkpoint and will instead create a"
 66 |       " new model instance.")
 67 | 
 68 |   # Training flags.
 69 |   flags.DEFINE_integer("batch_size", 1024,
 70 |                        "How many examples to process per batch for training.")
 71 |   flags.DEFINE_string("label_loss", "CrossEntropyLoss",
 72 |                       "Which loss function to use for training the model.")
 73 |   flags.DEFINE_float(
 74 |       "regularization_penalty", 1.0,
 75 |       "How much weight to give to the regularization loss (the label loss has "
 76 |       "a weight of 1).")
 77 |   flags.DEFINE_float("base_learning_rate", 0.01,
 78 |                      "Which learning rate to start with.")
 79 |   flags.DEFINE_float("learning_rate_decay", 0.95,
 80 |                      "Learning rate decay factor to be applied every "
 81 |                      "learning_rate_decay_examples.")
 82 |   flags.DEFINE_float("learning_rate_decay_examples", 4000000,
 83 |                      "Multiply current learning rate by learning_rate_decay "
 84 |                      "every learning_rate_decay_examples.")
 85 |   flags.DEFINE_integer("num_epochs", 5,
 86 |                        "How many passes to make over the dataset before "
 87 |                        "halting training.")
 88 |   flags.DEFINE_integer("max_steps", None,
 89 |                        "The maximum number of iterations of the training loop.")
 90 |   flags.DEFINE_integer("export_model_steps", 1000,
 91 |                        "The period, in number of steps, with which the model "
 92 |                        "is exported for batch prediction.")
 93 | 
 94 |   # Other flags.
 95 |   flags.DEFINE_integer("num_readers", 8,
 96 |                        "How many threads to use for reading input files.")
 97 |   flags.DEFINE_string("optimizer", "AdamOptimizer",
 98 |                       "What optimizer class to use.")
 99 |   flags.DEFINE_float("clip_gradient_norm", 1.0, "Norm to clip gradients to.")
100 |   flags.DEFINE_bool(
101 |       "log_device_placement", False,
102 |       "Whether to write the device on which every op will run into the "
103 |       "logs on startup.")
104 | 
105 | def validate_class_name(flag_value, category, modules, expected_superclass):
106 |   """Checks that the given string matches a class of the expected type.
107 | 
108 |   Args:
109 |     flag_value: A string naming the class to instantiate.
110 |     category: A string used further describe the class in error messages
111 |               (e.g. 'model', 'reader', 'loss').
112 |     modules: A list of modules to search for the given class.
113 |     expected_superclass: A class that the given class should inherit from.
114 | 
115 |   Raises:
116 |     FlagsError: If the given class could not be found or if the first class
117 |     found with that name doesn't inherit from the expected superclass.
118 | 
119 |   Returns:
120 |     True if a class was found that matches the given constraints.
121 |   """
122 |   candidates = [getattr(module, flag_value, None) for module in modules]
123 |   for candidate in candidates:
124 |     if not candidate:
125 |       continue
126 |     if not issubclass(candidate, expected_superclass):
127 |       raise flags.FlagsError("%s '%s' doesn't inherit from %s." %
128 |                              (category, flag_value,
129 |                               expected_superclass.__name__))
130 |     return True
131 |   raise flags.FlagsError("Unable to find %s '%s'." % (category, flag_value))
132 | 
133 | def get_input_data_tensors(reader,
134 |                            data_pattern,
135 |                            batch_size=1000,
136 |                            num_epochs=None,
137 |                            num_readers=1):
138 |   """Creates the section of the graph which reads the training data.
139 | 
140 |   Args:
141 |     reader: A class which parses the training data.
142 |     data_pattern: A 'glob' style path to the data files.
143 |     batch_size: How many examples to process at a time.
144 |     num_epochs: How many passes to make over the training data. Set to 'None'
145 |                 to run indefinitely.
146 |     num_readers: How many I/O threads to use.
147 | 
148 |   Returns:
149 |     A tuple containing the features tensor, labels tensor, and optionally a
150 |     tensor containing the number of frames per video. The exact dimensions
151 |     depend on the reader being used.
152 | 
153 |   Raises:
154 |     IOError: If no files matching the given pattern were found.
155 |   """
156 |   logging.info("Using batch size of " + str(batch_size) + " for training.")
157 |   with tf.name_scope("train_input"):
158 |     files = gfile.Glob(data_pattern)
159 |     if not files:
160 |       raise IOError("Unable to find training files. data_pattern='" +
161 |                     data_pattern + "'.")
162 |     logging.info("Number of training files: %s.", str(len(files)))
163 |     filename_queue = tf.train.string_input_producer(
164 |         files, num_epochs=num_epochs, shuffle=True)
165 |     training_data = [
166 |         reader.prepare_reader(filename_queue) for _ in range(num_readers)
167 |     ]
168 | 
169 |     return tf.train.shuffle_batch_join(
170 |         training_data,
171 |         batch_size=batch_size,
172 |         capacity=batch_size * 5,
173 |         min_after_dequeue=batch_size,
174 |         allow_smaller_final_batch=True,
175 |         enqueue_many=True)
176 | 
177 | 
178 | def find_class_by_name(name, modules):
179 |   """Searches the provided modules for the named class and returns it."""
180 |   modules = [getattr(module, name, None) for module in modules]
181 |   return next(a for a in modules if a)
182 | 
183 | def build_graph(reader,
184 |                 model,
185 |                 train_data_pattern,
186 |                 label_loss_fn=losses.CrossEntropyLoss(),
187 |                 batch_size=1000,
188 |                 base_learning_rate=0.01,
189 |                 learning_rate_decay_examples=1000000,
190 |                 learning_rate_decay=0.95,
191 |                 optimizer_class=tf.train.AdamOptimizer,
192 |                 clip_gradient_norm=1.0,
193 |                 regularization_penalty=1,
194 |                 num_readers=1,
195 |                 num_epochs=None):
196 |   """Creates the Tensorflow graph.
197 | 
198 |   This will only be called once in the life of
199 |   a training model, because after the graph is created the model will be
200 |   restored from a meta graph file rather than being recreated.
201 | 
202 |   Args:
203 |     reader: The data file reader. It should inherit from BaseReader.
204 |     model: The core model (e.g. logistic or neural net). It should inherit
205 |            from BaseModel.
206 |     train_data_pattern: glob path to the training data files.
207 |     label_loss_fn: What kind of loss to apply to the model. It should inherit
208 |                 from BaseLoss.
209 |     batch_size: How many examples to process at a time.
210 |     base_learning_rate: What learning rate to initialize the optimizer with.
211 |     optimizer_class: Which optimization algorithm to use.
212 |     clip_gradient_norm: Magnitude of the gradient to clip to.
213 |     regularization_penalty: How much weight to give the regularization loss
214 |                             compared to the label loss.
215 |     num_readers: How many threads to use for I/O operations.
216 |     num_epochs: How many passes to make over the data. 'None' means an
217 |                 unlimited number of passes.
218 |   """
219 | 
220 |   global_step = tf.Variable(0, trainable=False, name="global_step")
221 | 
222 |   local_device_protos = device_lib.list_local_devices()
223 |   gpus = [x.name for x in local_device_protos if x.device_type == 'GPU']
224 |   num_gpus = len(gpus)
225 | 
226 |   if num_gpus > 0:
227 |     logging.info("Using the following GPUs to train: " + str(gpus))
228 |     num_towers = num_gpus
229 |     device_string = '/gpu:%d'
230 |   else:
231 |     logging.info("No GPUs found. Training on CPU.")
232 |     num_towers = 1
233 |     device_string = '/cpu:%d'
234 | 
235 |   learning_rate = tf.train.exponential_decay(
236 |       base_learning_rate,
237 |       global_step * batch_size * num_towers,
238 |       learning_rate_decay_examples,
239 |       learning_rate_decay,
240 |       staircase=True)
241 |   tf.summary.scalar('learning_rate', learning_rate)
242 | 
243 |   optimizer = optimizer_class(learning_rate)
244 |   unused_video_id, model_input_raw, labels_batch, num_frames = (
245 |       get_input_data_tensors(
246 |           reader,
247 |           train_data_pattern,
248 |           batch_size=batch_size * num_towers,
249 |           num_readers=num_readers,
250 |           num_epochs=num_epochs))
251 |   tf.summary.histogram("model/input_raw", model_input_raw)
252 | 
253 |   feature_dim = len(model_input_raw.get_shape()) - 1
254 | 
255 |   model_input = tf.nn.l2_normalize(model_input_raw, feature_dim)
256 | 
257 |   tower_inputs = tf.split(model_input, num_towers)
258 |   tower_labels = tf.split(labels_batch, num_towers)
259 |   tower_num_frames = tf.split(num_frames, num_towers)
260 |   tower_gradients = []
261 |   tower_predictions = []
262 |   tower_label_losses = []
263 |   tower_reg_losses = []
264 |   for i in range(num_towers):
265 |     # For some reason these 'with' statements can't be combined onto the same
266 |     # line. They have to be nested.
267 |     with tf.device(device_string % i):
268 |       with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)):
269 |         with (slim.arg_scope([slim.model_variable, slim.variable], device="/cpu:0" if num_gpus!=1 else "/gpu:0")):
270 |           result = model.create_model(
271 |             tower_inputs[i],
272 |             num_frames=tower_num_frames[i],
273 |             vocab_size=reader.num_classes,
274 |             labels=tower_labels[i])
275 |           for variable in slim.get_model_variables():
276 |             tf.summary.histogram(variable.op.name, variable)
277 | 
278 |           predictions = result["predictions"]
279 |           tower_predictions.append(predictions)
280 | 
281 |           if "loss" in result.keys():
282 |             label_loss = result["loss"]
283 |           else:
284 |             label_loss = label_loss_fn.calculate_loss(predictions, tower_labels[i])
285 | 
286 |           if "regularization_loss" in result.keys():
287 |             reg_loss = result["regularization_loss"]
288 |           else:
289 |             reg_loss = tf.constant(0.0)
290 | 
291 |           reg_losses = tf.losses.get_regularization_losses()
292 |           if reg_losses:
293 |             reg_loss += tf.add_n(reg_losses)
294 | 
295 |           tower_reg_losses.append(reg_loss)
296 | 
297 |           # Adds update_ops (e.g., moving average updates in batch normalization) as
298 |           # a dependency to the train_op.
299 |           update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
300 |           if "update_ops" in result.keys():
301 |             update_ops += result["update_ops"]
302 |           if update_ops:
303 |             with tf.control_dependencies(update_ops):
304 |               barrier = tf.no_op(name="gradient_barrier")
305 |               with tf.control_dependencies([barrier]):
306 |                 label_loss = tf.identity(label_loss)
307 | 
308 |           tower_label_losses.append(label_loss)
309 | 
310 |           # Incorporate the L2 weight penalties etc.
311 |           final_loss = regularization_penalty * reg_loss + label_loss
312 |           gradients = optimizer.compute_gradients(final_loss,
313 |               colocate_gradients_with_ops=False)
314 |           tower_gradients.append(gradients)
315 |   label_loss = tf.reduce_mean(tf.stack(tower_label_losses))
316 |   tf.summary.scalar("label_loss", label_loss)
317 |   if regularization_penalty != 0:
318 |     reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses))
319 |     tf.summary.scalar("reg_loss", reg_loss)
320 |   merged_gradients = utils.combine_gradients(tower_gradients)
321 | 
322 |   if clip_gradient_norm > 0:
323 |     with tf.name_scope('clip_grads'):
324 |       merged_gradients = utils.clip_gradient_norms(merged_gradients, clip_gradient_norm)
325 | 
326 |   train_op = optimizer.apply_gradients(merged_gradients, global_step=global_step)
327 | 
328 |   tf.add_to_collection("global_step", global_step)
329 |   tf.add_to_collection("loss", label_loss)
330 |   tf.add_to_collection("predictions", tf.concat(tower_predictions, 0))
331 |   tf.add_to_collection("input_batch_raw", model_input_raw)
332 |   tf.add_to_collection("input_batch", model_input)
333 |   tf.add_to_collection("num_frames", num_frames)
334 |   tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32))
335 |   tf.add_to_collection("train_op", train_op)
336 | 
337 | 
338 | class Trainer(object):
339 |   """A Trainer to train a Tensorflow graph."""
340 | 
341 |   def __init__(self, cluster, task, train_dir, model, reader, model_exporter,
342 |                log_device_placement=True, max_steps=None,
343 |                export_model_steps=1000):
344 |     """"Creates a Trainer.
345 | 
346 |     Args:
347 |       cluster: A tf.train.ClusterSpec if the execution is distributed.
348 |         None otherwise.
349 |       task: A TaskSpec describing the job type and the task index.
350 |     """
351 | 
352 |     self.cluster = cluster
353 |     self.task = task
354 |     self.is_master = (task.type == "master" and task.index == 0)
355 |     self.train_dir = train_dir
356 |     self.config = tf.ConfigProto(
357 |         allow_soft_placement=True,log_device_placement=log_device_placement)
358 |     self.model = model
359 |     self.reader = reader
360 |     self.model_exporter = model_exporter
361 |     self.max_steps = max_steps
362 |     self.max_steps_reached = False
363 |     self.export_model_steps = export_model_steps
364 |     self.last_model_export_step = 0
365 | 
366 | #     if self.is_master and self.task.index > 0:
367 | #       raise StandardError("%s: Only one replica of master expected",
368 | #                           task_as_string(self.task))
369 | 
370 |   def run(self, start_new_model=False):
371 |     """Performs training on the currently defined Tensorflow graph.
372 | 
373 |     Returns:
374 |       A tuple of the training Hit@1 and the training PERR.
375 |     """
376 |     if self.is_master and start_new_model:
377 |       self.remove_training_directory(self.train_dir)
378 | 
379 |     target, device_fn = self.start_server_if_distributed()
380 | 
381 |     meta_filename = self.get_meta_filename(start_new_model, self.train_dir)
382 | 
383 |     with tf.Graph().as_default() as graph:
384 | 
385 |       if meta_filename:
386 |         saver = self.recover_model(meta_filename)
387 | 
388 |       with tf.device(device_fn):
389 |         if not meta_filename:
390 |           saver = self.build_model(self.model, self.reader)
391 | 
392 |         global_step = tf.get_collection("global_step")[0]
393 |         loss = tf.get_collection("loss")[0]
394 |         predictions = tf.get_collection("predictions")[0]
395 |         labels = tf.get_collection("labels")[0]
396 |         train_op = tf.get_collection("train_op")[0]
397 |         init_op = tf.global_variables_initializer()
398 | 
399 |     sv = tf.train.Supervisor(
400 |         graph,
401 |         logdir=self.train_dir,
402 |         init_op=init_op,
403 |         is_chief=self.is_master,
404 |         global_step=global_step,
405 |         save_model_secs=15 * 60,
406 |         save_summaries_secs=120,
407 |         saver=saver)
408 | 
409 |     logging.info("%s: Starting managed session.", task_as_string(self.task))
410 |     with sv.managed_session(target, config=self.config) as sess:
411 |       try:
412 |         logging.info("%s: Entering training loop.", task_as_string(self.task))
413 |         while (not sv.should_stop()) and (not self.max_steps_reached):
414 |           batch_start_time = time.time()
415 |           _, global_step_val, loss_val, predictions_val, labels_val = sess.run(
416 |               [train_op, global_step, loss, predictions, labels])
417 |           seconds_per_batch = time.time() - batch_start_time
418 |           examples_per_second = labels_val.shape[0] / seconds_per_batch
419 | 
420 |           if self.max_steps and self.max_steps <= global_step_val:
421 |             self.max_steps_reached = True
422 | 
423 |           if self.is_master and global_step_val % 10 == 0 and self.train_dir:
424 |             eval_start_time = time.time()
425 |             hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val)
426 |             perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val,
427 |                                                                       labels_val)
428 |             gap = eval_util.calculate_gap(predictions_val, labels_val)
429 |             eval_end_time = time.time()
430 |             eval_time = eval_end_time - eval_start_time
431 | 
432 |             logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) +
433 |               " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " +
434 |               ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) +
435 |               " GAP: " + ("%.2f" % gap))
436 | 
437 |             sv.summary_writer.add_summary(
438 |                 utils.MakeSummary("model/Training_Hit@1", hit_at_one),
439 |                 global_step_val)
440 |             sv.summary_writer.add_summary(
441 |                 utils.MakeSummary("model/Training_Perr", perr), global_step_val)
442 |             sv.summary_writer.add_summary(
443 |                 utils.MakeSummary("model/Training_GAP", gap), global_step_val)
444 |             sv.summary_writer.add_summary(
445 |                 utils.MakeSummary("global_step/Examples/Second",
446 |                                   examples_per_second), global_step_val)
447 |             sv.summary_writer.flush()
448 | 
449 |             # Exporting the model every x steps
450 |             time_to_export = ((self.last_model_export_step == 0) or
451 |                 (global_step_val - self.last_model_export_step
452 |                  >= self.export_model_steps))
453 | 
454 |             if self.is_master and time_to_export:
455 |               self.export_model(global_step_val, sv.saver, sv.save_path, sess)
456 |               self.last_model_export_step = global_step_val
457 |           else:
458 |             logging.info("training step " + str(global_step_val) + " | Loss: " +
459 |               ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second))
460 |       except tf.errors.OutOfRangeError:
461 |         logging.info("%s: Done training -- epoch limit reached.",
462 |                      task_as_string(self.task))
463 | 
464 |     logging.info("%s: Exited training loop.", task_as_string(self.task))
465 |     sv.Stop()
466 | 
467 |   def export_model(self, global_step_val, saver, save_path, session):
468 | 
469 |     # If the model has already been exported at this step, return.
470 |     if global_step_val == self.last_model_export_step:
471 |       return
472 | 
473 |     last_checkpoint = saver.save(session, save_path, global_step_val)
474 | 
475 |     model_dir = "{0}/export/step_{1}".format(self.train_dir, global_step_val)
476 |     logging.info("%s: Exporting the model at step %s to %s.",
477 |                  task_as_string(self.task), global_step_val, model_dir)
478 | 
479 |     self.model_exporter.export_model(
480 |         model_dir=model_dir,
481 |         global_step_val=global_step_val,
482 |         last_checkpoint=last_checkpoint)
483 | 
484 |   def start_server_if_distributed(self):
485 |     """Starts a server if the execution is distributed."""
486 | 
487 |     if self.cluster:
488 |       logging.info("%s: Starting trainer within cluster %s.",
489 |                    task_as_string(self.task), self.cluster.as_dict())
490 |       server = start_server(self.cluster, self.task)
491 |       target = server.target
492 |       device_fn = tf.train.replica_device_setter(
493 |           ps_device="/job:ps",
494 |           worker_device="/job:%s/task:%d" % (self.task.type, self.task.index),
495 |           cluster=self.cluster)
496 |     else:
497 |       target = ""
498 |       device_fn = ""
499 |     return (target, device_fn)
500 | 
501 |   def remove_training_directory(self, train_dir):
502 |     """Removes the training directory."""
503 |     try:
504 |       logging.info(
505 |           "%s: Removing existing train directory.",
506 |           task_as_string(self.task))
507 |       gfile.DeleteRecursively(train_dir)
508 |     except:
509 |       logging.error(
510 |           "%s: Failed to delete directory " + train_dir +
511 |           " when starting a new model. Please delete it manually and" +
512 |           " try again.", task_as_string(self.task))
513 | 
514 |   def get_meta_filename(self, start_new_model, train_dir):
515 |     if start_new_model:
516 |       logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
517 |                    task_as_string(self.task))
518 |       return None
519 | 
520 |     latest_checkpoint = tf.train.latest_checkpoint(train_dir)
521 |     if not latest_checkpoint:
522 |       logging.info("%s: No checkpoint file found. Building a new model.",
523 |                    task_as_string(self.task))
524 |       return None
525 | 
526 |     meta_filename = latest_checkpoint + ".meta"
527 |     if not gfile.Exists(meta_filename):
528 |       logging.info("%s: No meta graph file found. Building a new model.",
529 |                      task_as_string(self.task))
530 |       return None
531 |     else:
532 |       return meta_filename
533 | 
534 |   def recover_model(self, meta_filename):
535 |     logging.info("%s: Restoring from meta graph file %s",
536 |                  task_as_string(self.task), meta_filename)
537 |     return tf.train.import_meta_graph(meta_filename)
538 | 
539 |   def build_model(self, model, reader):
540 |     """Find the model and build the graph."""
541 | 
542 |     label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
543 |     optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train])
544 | 
545 |     build_graph(reader=reader,
546 |                  model=model,
547 |                  optimizer_class=optimizer_class,
548 |                  clip_gradient_norm=FLAGS.clip_gradient_norm,
549 |                  train_data_pattern=FLAGS.train_data_pattern,
550 |                  label_loss_fn=label_loss_fn,
551 |                  base_learning_rate=FLAGS.base_learning_rate,
552 |                  learning_rate_decay=FLAGS.learning_rate_decay,
553 |                  learning_rate_decay_examples=FLAGS.learning_rate_decay_examples,
554 |                  regularization_penalty=FLAGS.regularization_penalty,
555 |                  num_readers=FLAGS.num_readers,
556 |                  batch_size=FLAGS.batch_size,
557 |                  num_epochs=FLAGS.num_epochs)
558 | 
559 |     return tf.train.Saver(max_to_keep=0, keep_checkpoint_every_n_hours=0.25)
560 | 
561 | 
562 | def get_reader():
563 |   # Convert feature_names and feature_sizes to lists of values.
564 |   feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
565 |       FLAGS.feature_names, FLAGS.feature_sizes)
566 |   num_classes = FLAGS.num_classes
567 | 
568 |   if FLAGS.frame_features:
569 |     reader = readers.YT8MFrameFeatureReader(
570 |         num_classes=num_classes,
571 |         feature_names=feature_names, feature_sizes=feature_sizes)
572 |   else:
573 |     reader = readers.YT8MAggregatedFeatureReader(
574 |         num_classes=num_classes,
575 |         feature_names=feature_names, feature_sizes=feature_sizes)
576 | 
577 |   return reader
578 | 
579 | 
580 | class ParameterServer(object):
581 |   """A parameter server to serve variables in a distributed execution."""
582 | 
583 |   def __init__(self, cluster, task):
584 |     """Creates a ParameterServer.
585 | 
586 |     Args:
587 |       cluster: A tf.train.ClusterSpec if the execution is distributed.
588 |         None otherwise.
589 |       task: A TaskSpec describing the job type and the task index.
590 |     """
591 | 
592 |     self.cluster = cluster
593 |     self.task = task
594 | 
595 |   def run(self):
596 |     """Starts the parameter server."""
597 | 
598 |     logging.info("%s: Starting parameter server within cluster %s.",
599 |                  task_as_string(self.task), self.cluster.as_dict())
600 |     server = start_server(self.cluster, self.task)
601 |     server.join()
602 | 
603 | 
604 | def start_server(cluster, task):
605 |   """Creates a Server.
606 | 
607 |   Args:
608 |     cluster: A tf.train.ClusterSpec if the execution is distributed.
609 |       None otherwise.
610 |     task: A TaskSpec describing the job type and the task index.
611 |   """
612 | 
613 |   if not task.type:
614 |     raise ValueError("%s: The task type must be specified." %
615 |                      task_as_string(task))
616 |   if task.index is None:
617 |     raise ValueError("%s: The task index must be specified." %
618 |                      task_as_string(task))
619 | 
620 |   # Create and start a server.
621 |   return tf.train.Server(
622 |       tf.train.ClusterSpec(cluster),
623 |       protocol="grpc",
624 |       job_name=task.type,
625 |       task_index=task.index)
626 | 
627 | def task_as_string(task):
628 |   return "/job:%s/task:%s" % (task.type, task.index)
629 | 
630 | def main(unused_argv):
631 |   # Load the environment.
632 |   env = json.loads(os.environ.get("TF_CONFIG", "{}"))
633 | 
634 |   # Load the cluster data from the environment.
635 |   cluster_data = env.get("cluster", None)
636 |   cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
637 | 
638 |   # Load the task data from the environment.
639 |   task_data = env.get("task", None) or {"type": "master", "index": 0}
640 |   task = type("TaskSpec", (object,), task_data)
641 | 
642 |   # Logging the version.
643 |   logging.set_verbosity(tf.logging.INFO)
644 |   logging.info("%s: Tensorflow version: %s.",
645 |                task_as_string(task), tf.__version__)
646 | 
647 |   # Dispatch to a master, a worker, or a parameter server.
648 |   if not cluster or task.type == "master" or task.type == "worker":
649 |     model = find_class_by_name(FLAGS.model,
650 |         [frame_level_models, video_level_models])()
651 | 
652 |     reader = get_reader()
653 | 
654 |     model_exporter = export_model.ModelExporter(
655 |         frame_features=FLAGS.frame_features,
656 |         model=model,
657 |         reader=reader)
658 | 
659 |     Trainer(cluster, task, FLAGS.train_dir, model, reader, model_exporter,
660 |             FLAGS.log_device_placement, FLAGS.max_steps,
661 |             FLAGS.export_model_steps).run(start_new_model=FLAGS.start_new_model)
662 | 
663 |   elif task.type == "ps":
664 |     ParameterServer(cluster, task).run()
665 |   else:
666 |     raise ValueError("%s: Invalid task_type: %s." %
667 |                      (task_as_string(task), task.type))
668 | 
669 | if __name__ == "__main__":
670 |   app.run()
671 | 


--------------------------------------------------------------------------------